<a href="https://colab.research.google.com/github/MStamirski/Spaceship-Titanic/blob/main/Model_LogReg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparation

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount("/content/drive")
# Make the project folder on Drive the working directory; the notebook imports
# and the relative results-file path used later presumably resolve from here
# (NOTE(review): path is specific to the author's Drive layout).
%cd "/content/drive/MyDrive/Colab_Notebooks/SDA_upskill/Spaceship"

# Model with parameters

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
import warnings
# Silence every warning for the rest of the session. NOTE(review): this is a
# global filter, so it also hides any warnings raised while fitting the models.
warnings.filterwarnings("ignore")

In [None]:
# Untuned estimator used for the initial verification runs (solver capped at 100 iterations).
model = LogisticRegression(max_iter=100)

In [None]:
def objective(trial):
  """Score one candidate value of the regularisation parameter C.

  Asks ``trial`` for a value named 'logreg_c', sampled on a log scale between
  1e-10 and 1e10, fits a LogisticRegression with that C on the training
  split and returns its accuracy on the validation split.

  Reads the notebook-level variables X_train, y_train, X_val and y_val, so
  they must be assigned before the optimisation is started.
  """
  inverse_reg_strength = trial.suggest_float('logreg_c', 1e-10, 1e10, log=True)

  candidate = LogisticRegression(C=inverse_reg_strength)
  candidate.fit(X_train, y_train)

  # Validation accuracy is the value the search maximises/minimises (direction
  # is decided by the caller, not here).
  return accuracy_score(y_val, candidate.predict(X_val))

# Features datasets

In [None]:
!pip install import-ipynb

In [None]:
import import_ipynb

In [None]:
from FeaturesEngineering import get_features, categories_one_hot_encoding, categories_target_encoding, categories_leave_one_out_encoding

In [None]:
# Build the base feature frame for the 'train' data (helper lives in the
# FeaturesEngineering notebook; its exact contents are not visible here).
df = get_features('train')
# Three variants of the same features, differing in how the categorical columns
# are encoded — presumably one-hot, target and leave-one-out encoding, judging
# by the helper names. Each variant is evaluated separately below.
df_ohe = categories_one_hot_encoding(df)
df_te = categories_target_encoding(df)
df_looe = categories_leave_one_out_encoding(df)

# Initial verification

In [None]:
from Optimization import verify_feature_dataset, get_subsets, model_optimization, test_tuned_model

In [None]:
# Baseline for the one-hot-encoded features with the untuned model; the value
# is stored as the 'Initial_accuracy' entry of the results row written at the end.
acc_ohe_ini = verify_feature_dataset(df_ohe, model)

Accuracy: 77.45398773006134%


In [None]:
# Baseline for the target-encoded features with the untuned model; the value
# is stored as the 'Initial_accuracy' entry of the results row written at the end.
acc_te_ini = verify_feature_dataset(df_te, model)

Accuracy: 76.11196319018406%


In [None]:
# Baseline for the leave-one-out-encoded features with the untuned model; the value
# is stored as the 'Initial_accuracy' entry of the results row written at the end.
acc_looe_ini = verify_feature_dataset(df_looe, model)

Accuracy: 75.76687116564416%


# Parameters tuning

In [None]:
# Split the one-hot-encoded dataset. `objective` reads X_train, y_train, X_val
# and y_val from these notebook-level names, so they must be reassigned here
# before the search is started.
X_train, X_val, X_test, y_train, y_val, y_test = get_subsets(df_ohe)
# Run the search over `objective`; the first argument (100) matches the number
# of finished trials reported in the output.
params = model_optimization(100, objective)
# Build a fresh estimator from the returned 'logreg_c' value and evaluate it on
# the test split, which the search itself never sees.
model = LogisticRegression(C=params['logreg_c'])
# result_ohe_tuned[0] and [1] are later saved as the tuned accuracy and the
# selected-feature count respectively.
result_ohe_tuned = test_tuned_model(model, X_train, y_train, X_test, y_test)

[32m[I 2023-04-08 18:47:04,766][0m A new study created in memory with name: no-name-d7611f13-b91f-4a82-a4ac-b1f2174611a6[0m
[32m[I 2023-04-08 18:47:04,839][0m Trial 0 finished with value: 0.4996405463695183 and parameters: {'logreg_c': 1.0497801206477555e-06}. Best is trial 0 with value: 0.4996405463695183.[0m
[32m[I 2023-04-08 18:47:04,896][0m Trial 1 finished with value: 0.4996405463695183 and parameters: {'logreg_c': 1.7623800946617488e-08}. Best is trial 0 with value: 0.4996405463695183.[0m
[32m[I 2023-04-08 18:47:04,963][0m Trial 2 finished with value: 0.4996405463695183 and parameters: {'logreg_c': 2.7853163861265793e-07}. Best is trial 0 with value: 0.4996405463695183.[0m
[32m[I 2023-04-08 18:47:05,181][0m Trial 3 finished with value: 0.792235801581596 and parameters: {'logreg_c': 280668771.5240519}. Best is trial 3 with value: 0.792235801581596.[0m
[32m[I 2023-04-08 18:47:05,296][0m Trial 4 finished with value: 0.792235801581596 and parameters: {'logreg_c': 169


Number of finished trials: 100
Best trial:
Value: 0.798705966930266
  Params: 
    logreg_c: 0.046692080400102885

Accuracy after tuning: 78.09085681426107%

Classification report
              precision    recall  f1-score   support

       False       0.79      0.76      0.77       861
        True       0.77      0.81      0.79       878

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739


Confusion matrix
[[651 210]
 [171 707]]

All features number: 90, Selected features number: 30
Selected_features:

HomePlanet_Earth
HomePlanet_Europa
CryoSleep_False
CryoSleep_True
Cabin_deck_C
Cabin_deck_G
Cabin_side_S
Cabin_num/100_3
Cabin_num/100_4
Cabin_num/100_8
Cabin_num/100_9
Cabin_num/100_10
Cabin_num/100_11
RService_deciles_1
RService_deciles_7
RService_deciles_8
RService_deciles_10
FCourt_deciles_8
FCourt_deciles_10
ShMall_deciles_8
ShMall_deciles_10
Spa_deciles_1
Spa_deciles_

In [None]:
# Split the target-encoded dataset. `objective` reads X_train, y_train, X_val
# and y_val from these notebook-level names, so they must be reassigned here
# before the search is started.
X_train, X_val, X_test, y_train, y_val, y_test = get_subsets(df_te)
# Run the search over `objective`; the first argument (100) matches the number
# of finished trials reported in the output.
params = model_optimization(100, objective)
# Build a fresh estimator from the returned 'logreg_c' value and evaluate it on
# the test split, which the search itself never sees.
model = LogisticRegression(C=params['logreg_c'])
# result_te_tuned[0] and [1] are later saved as the tuned accuracy and the
# selected-feature count respectively.
result_te_tuned = test_tuned_model(model, X_train, y_train, X_test, y_test)

[32m[I 2023-04-08 18:47:20,758][0m A new study created in memory with name: no-name-08766225-3c88-4338-b6da-def67d67229f[0m
[32m[I 2023-04-08 18:47:20,905][0m Trial 0 finished with value: 0.7685118619698059 and parameters: {'logreg_c': 41.70161962126884}. Best is trial 0 with value: 0.7685118619698059.[0m
[32m[I 2023-04-08 18:47:21,020][0m Trial 1 finished with value: 0.7685118619698059 and parameters: {'logreg_c': 585517.877251529}. Best is trial 0 with value: 0.7685118619698059.[0m
[32m[I 2023-04-08 18:47:21,060][0m Trial 2 finished with value: 0.4996405463695183 and parameters: {'logreg_c': 9.911431912698235e-09}. Best is trial 0 with value: 0.7685118619698059.[0m
[32m[I 2023-04-08 18:47:21,194][0m Trial 3 finished with value: 0.7685118619698059 and parameters: {'logreg_c': 16533224.475058682}. Best is trial 0 with value: 0.7685118619698059.[0m
[32m[I 2023-04-08 18:47:21,227][0m Trial 4 finished with value: 0.4996405463695183 and parameters: {'logreg_c': 2.877676499


Number of finished trials: 100
Best trial:
Value: 0.7728253055355859
  Params: 
    logreg_c: 0.992814554990874

Accuracy after tuning: 76.42323174238068%

Classification report
              precision    recall  f1-score   support

       False       0.76      0.77      0.76       861
        True       0.77      0.76      0.77       878

    accuracy                           0.76      1739
   macro avg       0.76      0.76      0.76      1739
weighted avg       0.76      0.76      0.76      1739


Confusion matrix
[[661 200]
 [210 668]]

All features number: 14, Selected features number: 7
Selected_features:

HomePlanet_transformed
Destination_transformed
Cabin_side_transformed
Cabin_num/100_transformed
RService_deciles_transformed
Spa_deciles_transformed
VRD_deciles_transformed


In [None]:
# Split the leave-one-out-encoded dataset. `objective` reads X_train, y_train,
# X_val and y_val from these notebook-level names, so they must be reassigned
# here before the search is started.
X_train, X_val, X_test, y_train, y_val, y_test = get_subsets(df_looe)
# Run the search over `objective`; the first argument (100) matches the number
# of finished trials reported in the output.
params = model_optimization(100, objective)
# Build a fresh estimator from the returned 'logreg_c' value and evaluate it on
# the test split, which the search itself never sees.
model = LogisticRegression(C=params['logreg_c'])
# result_looe_tuned[0] and [1] are later saved as the tuned accuracy and the
# selected-feature count respectively.
result_looe_tuned = test_tuned_model(model, X_train, y_train, X_test, y_test)

[32m[I 2023-04-08 18:47:29,741][0m A new study created in memory with name: no-name-f7deecfb-103e-41bc-8bfe-c430f99a58bc[0m
[32m[I 2023-04-08 18:47:29,811][0m Trial 0 finished with value: 0.7649173256649893 and parameters: {'logreg_c': 1.2102964023319742}. Best is trial 0 with value: 0.7649173256649893.[0m
[32m[I 2023-04-08 18:47:29,850][0m Trial 1 finished with value: 0.4996405463695183 and parameters: {'logreg_c': 4.0549280246460917e-08}. Best is trial 0 with value: 0.7649173256649893.[0m
[32m[I 2023-04-08 18:47:29,920][0m Trial 2 finished with value: 0.7641984184040259 and parameters: {'logreg_c': 648769.1996250073}. Best is trial 0 with value: 0.7649173256649893.[0m
[32m[I 2023-04-08 18:47:29,972][0m Trial 3 finished with value: 0.7584471603163192 and parameters: {'logreg_c': 0.08384599666282412}. Best is trial 0 with value: 0.7649173256649893.[0m
[32m[I 2023-04-08 18:47:30,001][0m Trial 4 finished with value: 0.7476635514018691 and parameters: {'logreg_c': 0.00082


Number of finished trials: 100
Best trial:
Value: 0.7656362329259525
  Params: 
    logreg_c: 0.6041391891310322

Accuracy after tuning: 76.19321449108683%

Classification report
              precision    recall  f1-score   support

       False       0.75      0.77      0.76       861
        True       0.77      0.75      0.76       878

    accuracy                           0.76      1739
   macro avg       0.76      0.76      0.76      1739
weighted avg       0.76      0.76      0.76      1739


Confusion matrix
[[665 196]
 [218 660]]

All features number: 14, Selected features number: 7
Selected_features:

HomePlanet_transformed
Destination_transformed
Cabin_side_transformed
Cabin_num/100_transformed
RService_deciles_transformed
Spa_deciles_transformed
VRD_deciles_transformed


# Save results

In [None]:
# One results row per categorical encoding, in the column order
# Model, Categories_encoding, Initial_accuracy, Tuned_Accuracy, Important_Features.
# The last column is "<selected> / <total>" with the totals hard-coded per encoding.
data = [
  ['LogisticRegression', encoding, initial_acc, tuned[0], str(tuned[1]) + total_suffix]
  for encoding, initial_acc, tuned, total_suffix in (
    ('OHE', acc_ohe_ini, result_ohe_tuned, " / 90"),
    ('TE', acc_te_ini, result_te_tuned, " / 14"),
    ('LOOE', acc_looe_ini, result_looe_tuned, " / 14"),
  )
]

In [None]:
import csv
from os.path import exists
# Results file, relative to the current working directory; rows are appended
# to it when it already exists.
resfile = 'spaceship_results.csv'

In [None]:
# Append this notebook's rows to the results CSV, writing the header first
# only when the file does not exist yet.
header = ['Model', 'Categories_encoding', 'Initial_accuracy', 'Tuned_Accuracy', 'Important_Features']
write_header = not exists(resfile)

# Append mode creates the file when it is missing, so one open() covers both
# cases. newline='' is what the csv module expects for every file it writes
# (the original append branch omitted it, which yields blank lines between rows
# on platforms that translate '\n'). The `with` block closes the file even if
# a write fails.
with open(resfile, 'a', newline='') as f:
  writer = csv.writer(f)
  if write_header:
    writer.writerow(header)
  writer.writerows(data)