**Выбор алгоритма с помощью LazyPredict**

# [Kaggle — Spaceship Titanic](https://www.kaggle.com/competitions/spaceship-titanic/overview)

**Входные данные:** частично предобработанные при исследовательском анализе данные.


**Цель проекта:** экспресс-тестирование разных алгоритмов.


**Задачи проекта:**

## Начальная подготовка

### Импорты

In [1506]:
import pandas as pd
import numpy as np

import os
import json
import warnings
from datetime import date, time, datetime
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split, StratifiedKFold, RepeatedStratifiedKFold
from imblearn.pipeline import Pipeline, make_pipeline

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

from feature_engine.selection import DropFeatures
from feature_engine.encoding import MeanEncoder, DecisionTreeEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SVMSMOTE

from lazypredict.Supervised import LazyClassifier

### Константы

In [1507]:
# Single seed reused by every call that takes a `random_state` argument;
# RS is a short alias for the same value.
RS = 99
RANDOM_STATE = RS
# Newline character.
CR = "\n"

### Вспомогательные функции 

In [1508]:
def custom_read_csv(path_local, separator=','):
    """
    Read a CSV dataset into a DataFrame.

    The file is looked up first at ``path_local`` exactly as given and, if it
    is not there, at the same path with a leading slash prepended (the
    location used when the notebook runs in the remote environment).

    Parameters
    ----------
    path_local : str
        Path to the CSV file.
    separator : str, default ','
        Column delimiter, forwarded to ``pandas.read_csv`` as ``sep``.

    Returns
    -------
    pandas.DataFrame
        Contents of the first candidate path that exists.

    Raises
    ------
    FileNotFoundError
        If the file exists at neither candidate path.
    """

    path_remote = '/' + path_local

    # Try the candidates in priority order: local first, then remote.
    for path in (path_local, path_remote):
        if os.path.exists(path):
            return pd.read_csv(path, sep=separator)

    # Fail loudly at the point of the problem. Printing a message and
    # implicitly returning None only postponed the failure to the first use
    # of the result, where it showed up as an unrelated AttributeError.
    raise FileNotFoundError(
        f"Ошибка: файл не найден ни по пути '{path_local}', ни по пути '{path_remote}'"
    )

### Оформление

In [1509]:
# ANSI escape sequences for emphasising text in printed output
class f:
    BOLD = "\x1b[1m"    # start bold
    ITALIC = "\x1b[3m"  # start italic
    END = "\x1b[0m"     # SGR 0: back to default formatting

In [1510]:
# pandas display settings
pd.set_option('display.max_colwidth', 100)              # characters shown per cell
pd.set_option('display.max_rows', 500)                  # rows printed before truncation
pd.set_option('display.max_columns', 100)               # columns printed before truncation
pd.set_option('display.float_format', '{:.3f}'.format)  # floats rendered with 3 decimals
pd.set_option('display.colheader_justify', 'left')      # left-align column headers
# pd.options.display.precision = 7
# pandas.options.mode.use_inf_as_na = True

In [1511]:
# appearance — miscellaneous
# Silences every warning category for the rest of the session, which also
# hides deprecation warnings coming from the libraries used below.
warnings.filterwarnings('ignore')

## Входные данные

### Чтение данных

In [1512]:
data_train = custom_read_csv('datasets/data_train.csv')  # passengers for whom the target value (Transported) is known

## Данные для моделей

### Feature Engineering

Продолжение. Первая часть выполнена в EDA.

#### Drop Features — удаление ненужных полей

Можно добавить в пайплайн, но там больше проблем. Например, удалённые поля всё равно выводятся на графике Feature_Importance.

In [1513]:
data_train.sample()

Unnamed: 0,PassengerId,HomePlanet,Cabin,Destination,Name,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Earth_TRAPPIST,Age_group,Food_Shopping,Room_Spa_Deck,Total_spents,is_Total_spents,Group,is_Alone,Group_Transported,Cab_1,Cab_2,Cab_3,Name_FL,Name_SL
2798,3021_02,Earth,F/574/S,TRAPPIST-1e,Billey Spencervan,0.0,37.0,0.0,6.41,0.0,1.792,0.0,7.401,0.0,1,50,1.792,7.716,7.719,0,3021,0,1,F,574,S,B,S


In [1514]:
# Remove columns that are not fed to the models. Entries commented out below
# are deliberately kept in the frame.
# NOTE(review): the sample printed above contains no 'Cabin_num' or 'Surname'
# column, and the `info()` output below does not match this list (it lacks
# 'Destination', 'VIP' and 'Cab_2'). The list was presumably edited after the
# recorded run — confirm it against the current data_train.csv, since `drop`
# raises KeyError for labels that are not present.
data_train = data_train.drop([
                              # original features
                              'PassengerId',
                              'Cabin',
                              'Name',
#                               'Destination',
                              'RoomService',
                              'FoodCourt',
                              'ShoppingMall',
                              'Spa',
                              'VRDeck',
                              'Age',
#                               'VIP',
                              # generated during the EDA stage
                              'Group',
                              'Cabin_num',
                              'Surname',
#                               'Earth_TRAPPIST',
                             ],
                             axis=1)

In [1515]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   HomePlanet         8693 non-null   object 
 1   CryoSleep          8693 non-null   float64
 2   Transported        8693 non-null   float64
 3   Earth_TRAPPIST     8693 non-null   int64  
 4   Age_group          8693 non-null   int64  
 5   Food_Shopping      8693 non-null   float64
 6   Room_Spa_Deck      8693 non-null   float64
 7   Total_spents       8693 non-null   float64
 8   is_Total_spents    8693 non-null   int64  
 9   is_Alone           8693 non-null   int64  
 10  Group_Transported  8693 non-null   int64  
 11  Cab_1              8693 non-null   object 
 12  Cab_3              8693 non-null   object 
 13  Name_FL            8493 non-null   object 
 14  Name_SL            8493 non-null   object 
dtypes: float64(5), int64(5), object(5)
memory usage: 1018.8+ KB


### Выделение признаков и целевой переменной

In [1516]:
# Target is the `Transported` column; features are every remaining column.
Y = data_train['Transported']
X = data_train.drop(columns='Transported')

In [1517]:
X.shape, Y.shape

((8693, 14), (8693,))

### Разделение на обучающую и тестовую выборки

Этот test – это часть, отрезанная от train. Нужен для локальной проверки модели.

In [1518]:
# Hold out 20% of the rows as a local test set, stratified on the target
# and seeded for a reproducible split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=RS,
    stratify=Y,
)

# Cell output: shapes of the four resulting pieces.
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((6954, 14), (6954,), (1739, 14), (1739,))

## Модель

### Drop Features

катастрофически замедляет выполнение

In [1519]:
# drop_features_list = [
#                       'PassengerId',
#                       'Cabin',
#                       'Name',
#                       'Destination',
#                       'RoomService',
#                       'FoodCourt',
#                       'ShoppingMall',
#                       'Spa',
#                       'VRDeck',
# #                       'Total_spents',
#                       'Age',
#                       'VIP',
#                       'Group',
#                      ]

# dropper = make_column_transformer(
#                                   ('drop', drop_features_list),
#                                    remainder='passthrough'
#                                  )

### Селекторы числовых и категориальных признаков

In [1520]:
# Column selectors by dtype: one picks the numeric columns, the other picks
# every column that is not numeric. Both are callables applied to a DataFrame.
selector_num = make_column_selector(dtype_include=np.number)
selector_cat = make_column_selector(dtype_exclude=np.number)

In [1521]:
selector_num(X)

['CryoSleep',
 'Earth_TRAPPIST',
 'Age_group',
 'Food_Shopping',
 'Room_Spa_Deck',
 'Total_spents',
 'is_Total_spents',
 'is_Alone',
 'Group_Transported']

In [1522]:
selector_cat(X)

['HomePlanet', 'Cab_1', 'Cab_3', 'Name_FL', 'Name_SL']

### Предобработка числовых признаков

In [1523]:
# Numeric branch: standardisation only. `make_pipeline` here is the imblearn
# variant imported at the top of the notebook.
num_preprocessor = make_pipeline(
                                 StandardScaler(),
#                                IterativeImputer(initial_strategy='mean', random_state=RS),  # already done during EDA
                                )

### Предобработка категориальных признаков

In [1524]:
# Categorical branch: dense one-hot encoding, first level of each feature
# dropped, categories unseen during fit ignored at transform time.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# — confirm against the installed version before upgrading.
cat_preprocessor = OneHotEncoder(sparse=False, drop='first', handle_unknown='ignore')

### Объединение предобработки

In [1525]:
# Route numeric columns through `num_preprocessor` and non-numeric columns
# through `cat_preprocessor`; any column matched by neither selector is passed
# through unchanged.
preprocessor = make_column_transformer(
#                                        ('drop', drop_features_list),
                                       (num_preprocessor, selector_num),
                                       (cat_preprocessor, selector_cat),
                                       remainder="passthrough"           # for future use (not required here)
                                      )

### Сборка пайплайна

In [1526]:
# Full preprocessing pipeline; currently a single step (the column transformer),
# with the optional column-dropping step disabled.
preprocessing = make_pipeline(
#                               dropper,
                              preprocessor
                             )
# Bare expression: the notebook renders the pipeline as the cell output.
preprocessing

### Остатки — доработать!

#### Feature Engineering

In [1527]:
# MeanEncoder_cat = MeanEncoder()
# MeanEncoder_all = MeanEncoder(ignore_format=True)
# # MeanEncoder_sel = ('MeanCategory_other', MeanEncoder(variables=[''], ignore_format=True))

In [1528]:
# Alternative categorical encoding: apply feature_engine's MeanEncoder to the
# non-numeric columns and pass the remaining columns through unchanged.
# Not referenced anywhere else in the visible part of this notebook.
MeanEncoder_cat = make_column_transformer(
                                          (
                                           MeanEncoder(),
                                           make_column_selector(dtype_exclude=['number'])
                                          ),
                                          remainder="passthrough"
                                         )

### Препроцессинг целевой переменной

Балансировка и т.п.

In [1529]:
# Seeded SVMSMOTE over-sampler for class balancing.
# Not referenced anywhere else in the visible part of this notebook.
balansing = SVMSMOTE(random_state=RS)

## Предобработка данных

С помощью пайплайна

In [1530]:
# Fit the preprocessing on the training part only, then apply the fitted
# transformation to the held-out part.
# Both names are rebound to the transformed output, so this cell cannot be
# re-run without first re-running the train/test split. The cross-validation
# loop below indexes X_train positionally (`X_train[index_train]`) and thus
# relies on the transformed result being array-like rather than a DataFrame.
# NOTE(review): the preprocessing is fitted on the whole X_train before the CV
# folds are cut, so scaler and encoder statistics include each fold's
# validation rows.
X_train = preprocessing.fit_transform(X_train, Y_train)
X_test = preprocessing.transform(X_test)

## LazyClassifier

Похоже, LazyClassifier на данный момент не поддерживает Pipeline.

In [1531]:
# LazyClassifier configured for quiet runs: no verbose output, warnings
# suppressed, no custom metric, per-model predictions not requested.
lazyClf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None, predictions=False)

In [1532]:
# model_dictionary = clf.provide_models(X_train, X_test, Y_train, Y_test)
# model_dictionary

### StratifiedKFold кросс-валидация

In [1533]:
# Number of folds and the stratified splitter used for cross-validation.
# N is reused further down as the divisor when averaging the per-fold scores.
N = 5
cv = StratifiedKFold(n_splits=N)

In [1534]:
# One score table per fold, collected in fold order.
models_cv = []

for index_train, index_valid in cv.split(X_train, Y_train):
    
    # X_train is indexed positionally (it was rebound to the preprocessing
    # output above), while Y_train is still a pandas object and needs `.iloc`.
    # `fit` returns a score table and the predictions; only the score table
    # is kept.
    models, predictions = lazyClf.fit(
                                      X_train[index_train],
                                      X_train[index_valid],
                                      Y_train.iloc[index_train],
                                      Y_train.iloc[index_valid],
                                     )
    models_cv.append(models)

100%|██████████| 29/29 [00:16<00:00,  1.75it/s]


'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:16<00:00,  1.73it/s]


'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:16<00:00,  1.76it/s]


'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:16<00:00,  1.75it/s]


'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:17<00:00,  1.69it/s]


In [1535]:
# Average the per-fold score tables.

# Start from a copy: accumulating with `+=` directly on models_cv[0] mutates
# the stored first-fold table, so re-running this cell would add the folds on
# top of an already accumulated sum.
models_cv_mean = models_cv[0].copy()

for fold_scores in models_cv[1:]:
    # Addition aligns rows on the index (the model name); a model missing from
    # any fold ends up as NaN instead of being averaged over fewer folds.
    models_cv_mean = models_cv_mean + fold_scores

# Divide by the number of tables actually collected, so the average stays
# correct whichever splitter (and number of repeats) produced them.
models_cv_mean = models_cv_mean / len(models_cv)

In [1536]:
# Result: fold-averaged scores, best accuracy first (rendered as the cell output).

models_cv_mean.sort_values('Accuracy', ascending=False)

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RandomForestClassifier,0.804,0.804,0.804,0.804,0.58
LGBMClassifier,0.801,0.801,0.801,0.801,0.116
XGBClassifier,0.8,0.8,0.8,0.8,0.218
AdaBoostClassifier,0.793,0.792,0.792,0.792,0.383
LogisticRegression,0.79,0.79,0.79,0.79,0.058
CalibratedClassifierCV,0.789,0.789,0.789,0.789,4.177
BaggingClassifier,0.789,0.789,0.789,0.788,0.245
LinearSVC,0.788,0.788,0.788,0.788,1.2
ExtraTreesClassifier,0.787,0.787,0.787,0.786,0.628
LinearDiscriminantAnalysis,0.786,0.786,0.786,0.786,0.063


### RepeatedStratifiedKFold кросс-валидация

In [1537]:
# N = 10
# R = 3
# cv = RepeatedStratifiedKFold(n_splits=N, n_repeats=R)

In [1538]:
# models_cv = []

# for index_train, index_valid in cv.split(X_train, Y_train):
    
#     models, predictions = clf.fit(
#                                   X_train[index_train],
#                                   X_train[index_valid],
#                                   Y_train.iloc[index_train],
#                                   Y_train.iloc[index_valid]
#                                  )
#     models_cv.append(models)

In [1539]:
# # усреднение всех пачек

# models_cv_mean = models_cv[0]

# for i in range(1, len(models_cv)):
#     models_cv_mean +=  models_cv[i]

# models_cv_mean = models_cv_mean / N / R

In [1540]:
# # результат

# models_cv_mean.sort_values('Accuracy', ascending=False)