# 1. Baseine

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import warnings

warnings.filterwarnings('ignore')

In [2]:
df = sns.load_dataset('titanic')
# удаление признаков
df.drop(columns=['alive', 'deck'], inplace=True)
# заполнение пропусков модой
df.fillna(df.mode().iloc[0], inplace=True)
df_label = pd.get_dummies(df, drop_first=True)

In [3]:
X = df_label.drop('survived', axis=1)
y = df_label['survived']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    test_size=0.2,
                                                    random_state=42)
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [4]:
lr_base = LogisticRegression()
lr_base.fit(X_train_std, y_train)

# получение вероятностей
y_train_base = lr_base.predict_proba(X_train_std)
y_proba_base = lr_base.predict_proba(X_test_std)

print(f'ROC-AUC train = {roc_auc_score(y_train, y_train_base[:,1])}')
print(f'ROC-AUC test = {roc_auc_score(y_test, y_proba_base[:,1])}')

ROC-AUC train = 0.8690455330546447
ROC-AUC test = 0.8683135704874835


# 2. GridSearch

Теперь посмотрим, как изменится значение метрики при подборе

In [7]:
# создадим словарь с параметрами, которые хотим установить и перебрать
parameters = {
    'max_iter': [1000],
    'C': [1, 10, 100],
    "solver": ["liblinear", "saga", "sag", "lbfgs"],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    "random_state": [42],
    "class_weight" : ['balanced']
}

In [8]:
lr_grid = LogisticRegression()
# подаем на вход модель, словарь с параметрами, установим, на какую метрику ориентироваться
clf = GridSearchCV(estimator=lr_grid,
                   param_grid=parameters,
                   scoring='roc_auc',
                   cv=3,
                   verbose=0)
clf.fit(X_train_std, y_train)

GridSearchCV(cv=3, estimator=LogisticRegression(),
             param_grid={'C': [1, 10, 100], 'class_weight': ['balanced'],
                         'max_iter': [1000],
                         'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                         'random_state': [42],
                         'solver': ['liblinear', 'saga', 'sag', 'lbfgs']},
             scoring='roc_auc')

In [9]:
# получение вероятностей
y_train_grid = clf.predict_proba(X_train_std)
y_proba_grid = clf.predict_proba(X_test_std)

print(f'ROC-AUC train = {roc_auc_score(y_train, y_train_grid[:,1])}')
print(f'ROC-AUC test = {roc_auc_score(y_test, y_proba_grid[:,1])}')

ROC-AUC train = 0.8694126678181349
ROC-AUC test = 0.8687088274044795


Видна небольшая положительная динамике как на train, так и на test.

In [18]:
clf.best_params_

{'C': 1,
 'class_weight': 'balanced',
 'max_iter': 1000,
 'penalty': 'l2',
 'random_state': 42,
 'solver': 'saga'}

In [20]:
clf.best_estimator_


LogisticRegression(C=1, class_weight='balanced', max_iter=1000, random_state=42,
                   solver='saga')

# 3 RandomizedSearch

In [21]:
from sklearn.model_selection import RandomizedSearchCV

# создадим словарь с параметрами, которые хотим установить и перебрать
parameters = {
    'max_iter': [1000],
    'C': [1, 10, 100],
    "solver": ["liblinear", "saga", "sag", "lbfgs"],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    "random_state": [42],
    "class_weight" : ['balanced']
}

In [22]:
lr_grid = LogisticRegression()
# подаем на вход модель, словарь с параметрами, установим, на какую метрику ориентироваться
clf = RandomizedSearchCV(estimator=lr_grid,
                         param_distributions=parameters,
                         random_state=42,
                         scoring='roc_auc',
                         cv=5,
                         verbose=0)
clf.fit(X_train_std, y_train)


RandomizedSearchCV(cv=5, estimator=LogisticRegression(),
                   param_distributions={'C': [1, 10, 100],
                                        'class_weight': ['balanced'],
                                        'max_iter': [1000],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    'none'],
                                        'random_state': [42],
                                        'solver': ['liblinear', 'saga', 'sag',
                                                   'lbfgs']},
                   random_state=42, scoring='roc_auc')

In [23]:
# получение вероятностей
y_train_grid = clf.predict_proba(X_train_std)
y_proba_grid = clf.predict_proba(X_test_std)

print(f'ROC-AUC train = {roc_auc_score(y_train, y_train_grid[:,1])}')
print(f'ROC-AUC test = {roc_auc_score(y_test, y_proba_grid[:,1])}')

ROC-AUC train = 0.8694293557619297
ROC-AUC test = 0.8687088274044795
