We won't consider the decision stump and the simple decision tree, because their Kaggle score (accuracy) is lower than that of the other models: ~59% and ~70% respectively, versus ~76%.

In [1]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report

from functions_for_titanik import make_prediction_file

In [2]:
# Relative paths to the CSV files holding the features (X) and the target (y)

PATH_TO_X_SAMPLE = "X_sample.csv"
PATH_TO_Y_SAMPLE = "y_sample.csv"

In [3]:
# Read the features and the target; both frames are indexed by PassengerId
X_sample, y_sample = (
    pd.read_csv(csv_path, index_col="PassengerId")
    for csv_path in (PATH_TO_X_SAMPLE, PATH_TO_Y_SAMPLE)
)

In [4]:
# Show column dtypes and non-null counts of the feature frame
X_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Pclass       891 non-null    int64  
 1   Age          891 non-null    float64
 2   SibSp        891 non-null    int64  
 3   Parch        891 non-null    int64  
 4   Fare         891 non-null    float64
 5   Family_size  891 non-null    int64  
 6   Sex_male     891 non-null    bool   
 7   Embarked_Q   891 non-null    bool   
 8   Embarked_S   891 non-null    bool   
dtypes: bool(3), float64(2), int64(4)
memory usage: 51.3 KB


In [5]:
# Train / hold-out split

# The split below is positional, so features and target must describe the same
# passengers in the same order; fail loudly instead of silently mis-pairing labels.
assert X_sample.index.equals(y_sample.index), "X_sample and y_sample are not row-aligned"

# Sequential 70/30 split (no shuffling). The boundary is derived from the loaded
# data rather than a hard-coded row count, so the proportion still holds if the
# sample file changes.
k_disconnection = (7 * len(X_sample)) // 10
X_train, X_test = X_sample.iloc[:k_disconnection], X_sample.iloc[k_disconnection:]
y_train, y_test = y_sample.iloc[:k_disconnection], y_sample.iloc[k_disconnection:]

In [6]:
# Sanity check #1: row counts, dtypes and non-null counts of both parts of the split
for heading, frame in (("X_train:", X_train), (" \nX_test:", X_test)):
    print(heading)
    frame.info()

X_train:
<class 'pandas.core.frame.DataFrame'>
Index: 623 entries, 1 to 623
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Pclass       623 non-null    int64  
 1   Age          623 non-null    float64
 2   SibSp        623 non-null    int64  
 3   Parch        623 non-null    int64  
 4   Fare         623 non-null    float64
 5   Family_size  623 non-null    int64  
 6   Sex_male     623 non-null    bool   
 7   Embarked_Q   623 non-null    bool   
 8   Embarked_S   623 non-null    bool   
dtypes: bool(3), float64(2), int64(4)
memory usage: 35.9 KB
 
X_test:
<class 'pandas.core.frame.DataFrame'>
Index: 268 entries, 624 to 891
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Pclass       268 non-null    int64  
 1   Age          268 non-null    float64
 2   SibSp        268 non-null    int64  
 3   Parch        268 non-null    int64  
 4   Fa

GridSearchCV

In [7]:
# Seed the tree: scikit-learn permutes the features at every split, so an
# unseeded tree may break ties between equally good splits differently on each
# fit, which makes the selected hyperparameters drift from run to run.
clf = DecisionTreeClassifier(random_state=42)

# Hyperparameter search space (also passed to RandomizedSearchCV)
parameters = {
    "criterion": ["entropy", "gini"],
    "max_depth": range(1, 10),
    "min_samples_split": range(2, 10),
    "min_samples_leaf": range(1, 10),
}

# Exhaustive search over the whole space with 5-fold cross-validation
grid_cv = GridSearchCV(clf, parameters, cv=5)

As you can see, the metrics show quite good results, but the hyperparameters are unstable. Let's try to fix that.

In [8]:
%%time

grid_cv.fit(X_train, y_train)
best_model_grid = grid_cv.best_estimator_
print("Best params:", best_model_grid)
print(classification_report(y_test, best_model_grid.predict(X_test), target_names=["Died", "Survived"]))

Best params: DecisionTreeClassifier(max_depth=7, min_samples_leaf=4, min_samples_split=4)
              precision    recall  f1-score   support

        Died       0.81      0.91      0.86       172
    Survived       0.80      0.62      0.70        96

    accuracy                           0.81       268
   macro avg       0.81      0.77      0.78       268
weighted avg       0.81      0.81      0.80       268

CPU times: total: 21.2 s
Wall time: 22.2 s


RandomizedSearchCV

In [9]:
# Randomized counterpart of the grid search: same estimator, same search space, 5-fold CV
rand_grid_cv = RandomizedSearchCV(estimator=clf, param_distributions=parameters, cv=5)

Note that precision and recall each have a spread of roughly ±0.06 (6 percentage points); we would like them to be stable and, at the same time, as high as possible.

In [10]:
# One plain RandomizedSearchCV run, scored on the held-out rows
rand_grid_cv.fit(X_train, y_train)
rand_pred = rand_grid_cv.predict(X_test)
print(classification_report(y_test, rand_pred, target_names=["Died", "Survived"]))

              precision    recall  f1-score   support

        Died       0.80      0.95      0.87       172
    Survived       0.87      0.56      0.68        96

    accuracy                           0.81       268
   macro avg       0.83      0.76      0.78       268
weighted avg       0.82      0.81      0.80       268



Repeating the code block several times, we see that the most frequent results are: criterion="gini", max_depth=7, min_samples_leaf=4, min_samples_split=6

In [544]:
%%time

#Create start value our parameters
md_lst = []
msl_lst = []
mss_lst = []

entropy_count, gini_count = 0, 0
md_value = -1
msl_value = -1
mss_value = -1

#Going through 1000 RandomizedSearchCV and save parametrs
for model in range(1000):
    rand_grid_cv.fit(X_train, y_train)
    local_best = rand_grid_cv.best_estimator_

    if local_best.criterion == "entropy":
        entropy_count += 1
    else:
        gini_count += 1
    
    md_lst.append(local_best.max_depth)
    msl_lst.append(local_best.min_samples_leaf)
    mss_lst.append(local_best.min_samples_split)

#We choose the categorical value according to the fashion, and the numerical values according to the median
if entropy_count > gini_count:
    criterion_value = "entropy"
elif entropy_count == gini_count:
    criterion_value = "gini"
    print("!")
else:
    criterion_value = "gini"

lst_start_values = [md_lst, msl_lst, mss_lst]
lst_end_values = []
for value in lst_start_values:
    value = sorted(value)
    lst_end_values.append((value[499] + value[500]) // 2)

#Interpreting the result
best_model_rand = DecisionTreeClassifier(criterion=criterion_value, max_depth=lst_end_values[0], min_samples_leaf=lst_end_values[1], min_samples_split=lst_end_values[2])
best_model_rand.fit(X_train, y_train)
print(best_model_rand.criterion, best_model_rand.max_depth, best_model_rand.min_samples_leaf, best_model_rand.min_samples_split)
print(classification_report(y_test, best_model_rand.predict(X_test), target_names=["Died", "Survived"]))

gini 7 4 6
              precision    recall  f1-score   support

        Died       0.81      0.91      0.86       172
    Survived       0.80      0.62      0.70        96

    accuracy                           0.81       268
   macro avg       0.81      0.77      0.78       268
weighted avg       0.81      0.81      0.80       268

CPU times: total: 2min 38s
Wall time: 2min 59s


We are quite satisfied with these hyperparameters, but we will try to improve on their results, using them as a baseline.

In [463]:
# Rebuild the tree with the hyperparameters found by the repeated randomized search
repitmodel_best_rand = DecisionTreeClassifier(
    criterion="gini",
    max_depth=7,
    min_samples_leaf=4,
    min_samples_split=6,
).fit(X_train, y_train)
repit_pred = repitmodel_best_rand.predict(X_test)
print(classification_report(y_test, repit_pred, target_names=["Died", "Survived"]))

              precision    recall  f1-score   support

        Died       0.81      0.91      0.86       172
    Survived       0.80      0.62      0.70        96

    accuracy                           0.81       268
   macro avg       0.81      0.77      0.78       268
weighted avg       0.81      0.81      0.80       268



Without CV

This decision tree improves all metric values: Survived precision is higher by 0.04 and Died recall is higher by 0.03.

In [515]:
# Hand-picked candidate 1: gini tree limited to depth 3
without_cv_1 = DecisionTreeClassifier(criterion="gini", max_depth=3).fit(X_train, y_train)
whc_pred_1 = without_cv_1.predict(X_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=whc_pred_1,
        target_names=["Died", "Survived"],
    )
)

              precision    recall  f1-score   support

        Died       0.82      0.94      0.88       172
    Survived       0.86      0.64      0.73        96

    accuracy                           0.83       268
   macro avg       0.84      0.79      0.80       268
weighted avg       0.84      0.83      0.83       268



This decision tree improves almost all metric values; Survived recall in particular is 0.06 higher.

In [518]:
# Hand-picked candidate 2: entropy tree, depth 5, at least 4 samples per leaf
without_cv_2 = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=5,
    min_samples_leaf=4,
).fit(X_train, y_train)
whc_pred_2 = without_cv_2.predict(X_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=whc_pred_2,
        target_names=["Died", "Survived"],
    )
)

              precision    recall  f1-score   support

        Died       0.84      0.92      0.88       172
    Survived       0.82      0.68      0.74        96

    accuracy                           0.83       268
   macro avg       0.83      0.80      0.81       268
weighted avg       0.83      0.83      0.83       268



This decision tree shows a significant decrease in Died recall (by 0.09) and Survived precision (by 0.07), but a significant increase in Survived recall (by 0.09) and in Died precision (by 0.03).

In [520]:
# Hand-picked candidate 3: default criterion, depth limited to 2
without_cv_3 = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)
whc_pred_3 = without_cv_3.predict(X_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=whc_pred_3,
        target_names=["Died", "Survived"],
    )
)

              precision    recall  f1-score   support

        Died       0.84      0.84      0.84       172
    Survived       0.71      0.71      0.71        96

    accuracy                           0.79       268
   macro avg       0.77      0.77      0.77       268
weighted avg       0.79      0.79      0.79       268



By its metrics, this tree is similar to the previous one: some metric values are higher or equal, and only Survived recall can be lower or equal.

In [539]:
# Hand-picked candidate 4: unrestricted depth, but a node needs 10 samples to be split
without_cv_4 = DecisionTreeClassifier(criterion="gini", min_samples_split=10).fit(X_train, y_train)
whc_pred_4 = without_cv_4.predict(X_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=whc_pred_4,
        target_names=["Died", "Survived"],
    )
)

              precision    recall  f1-score   support

        Died       0.84      0.85      0.85       172
    Survived       0.73      0.71      0.72        96

    accuracy                           0.80       268
   macro avg       0.79      0.78      0.78       268
weighted avg       0.80      0.80      0.80       268



Prediction for kaggle data

In [542]:
# Load the Kaggle test sample, indexed by PassengerId like the training sample
PATH_TO_TEST = "X_test_kaggle.csv"
X_test_kaggle = pd.read_csv(PATH_TO_TEST, index_col="PassengerId")   # Test sample

In [543]:
# Predict on the Kaggle sample with every hand-tuned tree and pass each result
# to make_prediction_file (project helper; presumably writes the CSV - verify).

all_models = [without_cv_1, without_cv_2, without_cv_3, without_cv_4]

for model_id, fitted_model in enumerate(all_models):
    model_pred = fitted_model.predict(X_test_kaggle)
    # File names are 1-based so they line up with the without_cv_N variable names
    make_prediction_file(model_pred, X_test_kaggle, f"whc_pred_{model_id + 1}.csv")