# Spaceship Titanic Kaggle competition

## Initialisation

In [1]:
import numpy as np
import pandas as pd
import kaggle
import os
import json
import time
from titanic_spaceship_package import dataframe_preprocessing, get_pipeline

In [2]:
# Working directories, relative to the notebook's working directory.
REPO_DATA = "data"                            # raw competition files
REPO_DATA_PREPROCESSED = "data_preprocessed"  # preprocessed train/test CSV files
REPO_MODEL = "model"                          # per-model JSON result files
REPO_SUBMISSION = "submission"                # submission CSV files

In [3]:
# Create every working directory up front. exist_ok=True makes the cell safe to
# re-run and removes the gap between "check it exists" and "create it".
for repo in (REPO_DATA, REPO_DATA_PREPROCESSED, REPO_MODEL, REPO_SUBMISSION):
    os.makedirs(repo, exist_ok=True)

## Load data

In [4]:
import zipfile

if not os.path.exists(os.path.join(REPO_DATA, 'train.csv')) \
or not os.path.exists(os.path.join(REPO_DATA, 'test.csv')) \
or not os.path.exists(os.path.join(REPO_DATA, 'sample_submission.csv')):
    
    !kaggle competitions download -c spaceship-titanic -p $REPO_DATA
    
    with zipfile.ZipFile(os.path.join(REPO_DATA, 'spaceship-titanic.zip'), 'r') as zip_ref:
        zip_ref.extractall(REPO_DATA)
        
df_train = pd.read_csv(os.path.join(REPO_DATA, 'train.csv'))
df_test = pd.read_csv(os.path.join(REPO_DATA, 'test.csv'))

In [5]:
# Locations of the four preprocessed artefacts.
path_X_train = os.path.join(REPO_DATA_PREPROCESSED, "X_train.csv")
path_X_test = os.path.join(REPO_DATA_PREPROCESSED, "X_test.csv")
path_y_train = os.path.join(REPO_DATA_PREPROCESSED, "y_train.csv")
path_passenger_id = os.path.join(REPO_DATA_PREPROCESSED, "list_passenger_id.csv")

preprocessed_paths = (path_X_train, path_X_test, path_y_train, path_passenger_id)

if all(os.path.exists(csv_path) for csv_path in preprocessed_paths):

    # Every artefact is already on disk: reload instead of recomputing.
    X_train = pd.read_csv(path_X_train)
    X_test = pd.read_csv(path_X_test)
    y_train = pd.read_csv(path_y_train).Transported
    list_passenger_id = pd.read_csv(path_passenger_id).PassengerId

else:

    # At least one artefact is missing: preprocess the raw frames and save all four.
    X_train, X_test, y_train, list_passenger_id = dataframe_preprocessing(df_train, df_test)
    X_train.to_csv(path_X_train, index=False)
    X_test.to_csv(path_X_test, index=False)
    y_train.to_csv(path_y_train, index=False)
    list_passenger_id.to_csv(path_passenger_id, index=False)

## Kaggle submission function

In [6]:
def analyze_model(model_name, repo_model, repo_submission, X_train, y_train, X_test, list_passenger_id):
    
    with open(os.path.join(repo_model, model_name + ".json"), 'r') as file:
        data = json.load(file)
        
    print("Model name: {}".format(data.get("model_name")))
    print("Cross validation score: {:.5f}".format(data.get("best_score")))
    print("Best params: {}".format(data.get("best_params")))
    print("Time to tune: {} seconds".format(int(data.get("time_to_tune"))))
    
    score_kaggle = data.get("score_kaggle")
    
    if score_kaggle == None:
    
        pipeline = get_pipeline(model_name=model_name, X_train=X_train, y_train=y_train)
        pipeline = pipeline.set_params(**data["best_params"])
        pipeline = pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        submission = pd.DataFrame({'PassengerId': list_passenger_id, 'Transported': y_pred})
        submission.to_csv(os.path.join(repo_submission, model_name + ".csv"), index=False)
            
        path = os.path.join(repo_submission, model_name + ".csv")
        request = !kaggle competitions submit -c spaceship-titanic -f $path -m $model_name"
                
        time.sleep(5)
        
        response = !kaggle competitions submissions -c spaceship-titanic --csv
        score_kaggle = float(response[2].split(',')[4])
        
        data["score_kaggle"] = score_kaggle
        
        with open(os.path.join(repo_model, model_name + ".json"), 'w+') as file:
            json.dump(data, file)
        
    print("Kaggle submission score: {}".format(score_kaggle))

## Basic features

### Pipeline v1

#### Logistic regression

In [53]:
MODEL_NAME = "logistic_regression__v01"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME
    
analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: logistic_regression__v01
Cross validation score: 0.77948
Best params: {'logistic__C': 0.01, 'logistic__penalty': 'l2', 'logistic__solver': 'liblinear'}
Time to tune: 1476 seconds
Kaggle submission score: 0.78442


#### KNN

In [54]:
MODEL_NAME = "knn__v01"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: knn__v01
Cross validation score: 0.78880
Best params: {'knn__n_neighbors': 47, 'knn__p': 2, 'knn__weights': 'uniform'}
Time to tune: 5311 seconds
Kaggle submission score: 0.78793


#### SVM

In [55]:
MODEL_NAME = "svm__v01"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: svm__v01
Cross validation score: 0.80053
Best params: {'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}
Time to tune: 2623 seconds
Kaggle submission score: 0.80032


### Pipeline v2

#### Logistic regression

In [56]:
MODEL_NAME = "logistic_regression__v02"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: logistic_regression__v02
Cross validation score: 0.78155
Best params: {'feature_selection__k': 15, 'logistic__C': 0.021544346900318822, 'logistic__l1_ratio': 0.1, 'logistic__penalty': 'elasticnet', 'logistic__solver': 'saga'}
Time to tune: 12903 seconds
Kaggle submission score: 0.78349


#### KNN

In [58]:
MODEL_NAME = "knn__v02"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: knn__v02
Cross validation score: 0.79029
Best params: {'feature_selection__k': 18, 'knn__n_neighbors': 31, 'knn__p': 1, 'knn__weights': 'uniform'}
Time to tune: 64065 seconds
Kaggle submission score: 0.78442


#### SVM

In [57]:
MODEL_NAME = "svm__v02"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: svm__v02
Cross validation score: 0.80122
Best params: {'feature_selection__k': 20, 'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}
Time to tune: 307954 seconds
Kaggle submission score: 0.79892


### Pipeline v3

#### Logistic regression

In [None]:
MODEL_NAME = "logistic_regression__v03"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

#### KNN

In [7]:
MODEL_NAME = "knn__v03"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: knn__v03
Cross validation score: 0.79593
Best params: {'feature_selection__k': 13, 'knn__n_neighbors': 48, 'knn__p': 1, 'knn__weights': 'uniform'}
Time to tune: 75335 seconds
Kaggle submission score: 0.78887


#### SVM

In [None]:
MODEL_NAME = "svm__v03"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

### Pipeline v4

#### Logistic regression

In [None]:
MODEL_NAME = "logistic_regression__v04"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

#### KNN

In [None]:
MODEL_NAME = "knn__v04"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

#### SVM

In [None]:
MODEL_NAME = "svm__v04"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

### Pipeline v5

#### Logistic regression

In [None]:
MODEL_NAME = "logistic_regression__v05"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

#### KNN

In [None]:
MODEL_NAME = "knn__v05"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

#### SVM

In [None]:
MODEL_NAME = "svm__v05"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

### Pipeline v6

#### Logistic regression

In [None]:
MODEL_NAME = "logistic_regression__v06"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

#### KNN

In [None]:
MODEL_NAME = "knn__v06"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

#### SVM

In [None]:
MODEL_NAME = "svm__v06"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

### Pipeline v7

#### Logistic regression

In [None]:
MODEL_NAME = "logistic_regression__v07"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

#### KNN

In [None]:
MODEL_NAME = "knn__v07"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

#### SVM

In [None]:
MODEL_NAME = "svm__v07"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

### Pipeline v8

#### Logistic regression

In [29]:
MODEL_NAME = "logistic_regression__v08"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Traceback (most recent call last):
  File "C:\Users\kevin\Kaggle\Titanic_Spaceship\script_tuning.py", line 16, in <module>
    pipeline = get_pipeline(model_name=MODEL_NAME, X_train=X_train, y_train=y_train)
  File "C:\Users\kevin\Kaggle\Titanic_Spaceship\titanic_spaceship_package\get_pipeline.py", line 39, in get_pipeline
    rf = RandomForestClassifier(random_state=42, max_jobs=-1)
TypeError: RandomForestClassifier.__init__() got an unexpected keyword argument 'max_jobs'


FileNotFoundError: [Errno 2] No such file or directory: 'model\\logistic_regression__v08.json'

#### KNN

In [None]:
MODEL_NAME = "knn__v08"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

#### SVM

In [None]:
MODEL_NAME = "svm__v08"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

### Pipeline v9

#### Logistic regression

In [29]:
MODEL_NAME = "logistic_regression__v09"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Traceback (most recent call last):
  File "C:\Users\kevin\Kaggle\Titanic_Spaceship\script_tuning.py", line 16, in <module>
    pipeline = get_pipeline(model_name=MODEL_NAME, X_train=X_train, y_train=y_train)
  File "C:\Users\kevin\Kaggle\Titanic_Spaceship\titanic_spaceship_package\get_pipeline.py", line 39, in get_pipeline
    rf = RandomForestClassifier(random_state=42, max_jobs=-1)
TypeError: RandomForestClassifier.__init__() got an unexpected keyword argument 'max_jobs'


FileNotFoundError: [Errno 2] No such file or directory: 'model\\logistic_regression__v08.json'

#### KNN

In [None]:
MODEL_NAME = "knn__v09"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

#### SVM

In [None]:
MODEL_NAME = "svm__v09"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

## Analysis

In [8]:
# Two-digit, zero-padded version tags "01" .. "09", as used in the model file names.
VERSIONS = ["{:02d}".format(number) for number in range(1, 10)]

# Column order of the per-model result tables.
COLUMNS = [
    "Version",
    "Description",
    "Best_score",
    "Time_to_tune",
    "Score_kaggle",
    "Best_params",
]

# Human-readable description of each pipeline version.
DESCRIPTION_VERSIONS = {
    "01": "Basic feature",
    "02": "Basic feature with feature selection f_classif",
    # Versions 03-07 are the MI variants with n = 1..5.
    **{
        "{:02d}".format(n + 2): "Basic feature with feature selection MI n={}".format(n)
        for n in range(1, 6)
    },
    "08": "Basic feature with feature selection from rf",
    "09": "Basic feature with feature selection from ridge",
}

### Logistic regression

In [12]:
NAME_MODEL = "logistic_regression"

# Collect one record per version that has a result file, then build the frame in
# a single call instead of growing it with pd.concat inside the loop.
records = []
for version in VERSIONS:
    path_json = os.path.join(REPO_MODEL, NAME_MODEL + "__v" + version + ".json")
    if not os.path.exists(path_json):
        continue
    with open(path_json, "r") as file:
        data = json.load(file)
    records.append({
        "Version": version,
        "Description": DESCRIPTION_VERSIONS.get(version),
        "Best_score": data["best_score"],
        "Time_to_tune": int(data["time_to_tune"]),
        # .get: a result file that has not been through a Kaggle submission yet
        # has no "score_kaggle" entry.
        "Score_kaggle": data.get("score_kaggle"),
        "Best_params": data["best_params"],
    })

df_results = pd.DataFrame(records, columns=COLUMNS)

# Highlight the row(s) holding the best cross-validation score.
best_cv_score = df_results.Best_score.max()
df_results.style.apply(
    lambda row: [
        'background-color: yellow' if row.Best_score == best_cv_score else 'background-color: white'
    ] * df_results.shape[1],
    axis=1,
)

Unnamed: 0,Version,Description,Best_score,Time_to_tune,Score_kaggle,Best_params
0,1,Basic feature,0.779479,1476,0.78442,"{'logistic__C': 0.01, 'logistic__penalty': 'l2', 'logistic__solver': 'liblinear'}"
1,2,Basic feature with feature selection f_classif,0.781549,12903,0.78349,"{'feature_selection__k': 15, 'logistic__C': 0.021544346900318822, 'logistic__l1_ratio': 0.1, 'logistic__penalty': 'elasticnet', 'logistic__solver': 'saga'}"


### KNN

In [11]:
NAME_MODEL = "knn"

# Collect one record per version that has a result file, then build the frame in
# a single call instead of growing it with pd.concat inside the loop.
records = []
for version in VERSIONS:
    path_json = os.path.join(REPO_MODEL, NAME_MODEL + "__v" + version + ".json")
    if not os.path.exists(path_json):
        continue
    with open(path_json, "r") as file:
        data = json.load(file)
    records.append({
        "Version": version,
        "Description": DESCRIPTION_VERSIONS.get(version),
        "Best_score": data["best_score"],
        "Time_to_tune": int(data["time_to_tune"]),
        # .get: a result file that has not been through a Kaggle submission yet
        # has no "score_kaggle" entry.
        "Score_kaggle": data.get("score_kaggle"),
        "Best_params": data["best_params"],
    })

df_results = pd.DataFrame(records, columns=COLUMNS)

# Highlight the row(s) holding the best cross-validation score.
best_cv_score = df_results.Best_score.max()
df_results.style.apply(
    lambda row: [
        'background-color: yellow' if row.Best_score == best_cv_score else 'background-color: white'
    ] * df_results.shape[1],
    axis=1,
)

Unnamed: 0,Version,Description,Best_score,Time_to_tune,Score_kaggle,Best_params
0,1,Basic feature,0.788799,5311,0.78793,"{'knn__n_neighbors': 47, 'knn__p': 2, 'knn__weights': 'uniform'}"
1,2,Basic feature with feature selection f_classif,0.790294,64065,0.78442,"{'feature_selection__k': 18, 'knn__n_neighbors': 31, 'knn__p': 1, 'knn__weights': 'uniform'}"
2,3,Basic feature with feature selection MI n=1,0.795932,75335,0.78887,"{'feature_selection__k': 13, 'knn__n_neighbors': 48, 'knn__p': 1, 'knn__weights': 'uniform'}"


### SVM 

In [13]:
NAME_MODEL = "svm"

# Collect one record per version that has a result file, then build the frame in
# a single call instead of growing it with pd.concat inside the loop.
records = []
for version in VERSIONS:
    path_json = os.path.join(REPO_MODEL, NAME_MODEL + "__v" + version + ".json")
    if not os.path.exists(path_json):
        continue
    with open(path_json, "r") as file:
        data = json.load(file)
    records.append({
        "Version": version,
        "Description": DESCRIPTION_VERSIONS.get(version),
        "Best_score": data["best_score"],
        "Time_to_tune": int(data["time_to_tune"]),
        # .get: a result file that has not been through a Kaggle submission yet
        # has no "score_kaggle" entry.
        "Score_kaggle": data.get("score_kaggle"),
        "Best_params": data["best_params"],
    })

df_results = pd.DataFrame(records, columns=COLUMNS)

# Highlight the row(s) holding the best cross-validation score.
best_cv_score = df_results.Best_score.max()
df_results.style.apply(
    lambda row: [
        'background-color: yellow' if row.Best_score == best_cv_score else 'background-color: white'
    ] * df_results.shape[1],
    axis=1,
)

Unnamed: 0,Version,Description,Best_score,Time_to_tune,Score_kaggle,Best_params
0,1,Basic feature,0.800532,2623,0.80032,"{'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}"
1,2,Basic feature with feature selection f_classif,0.801222,307954,0.79892,"{'feature_selection__k': 20, 'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}"
