### Оптимизация алгоритмов машинного обучения с использованием конвейеров (pipeline)

#### Анализ данных

Импорт библиотек

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

Загружаем данные

In [None]:
# Load the raw dataset from a CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="data.csv")

Общая информация о датасете

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
data.info()

In [None]:
# Preview the first five rows (5 is the default row count of head()).
print(data.head(5))

Статистическое описание числовых столбцов

In [None]:
# Summary statistics; with include=None (the default) describe() reports on
# the numeric columns of a mixed-type frame.
print(data.describe(include=None))

Проверка пропусков

In [None]:
# Count missing values per column (isna() is the same method as isnull()).
print(data.isna().sum())

Разделение на признаки и целевую переменную

In [None]:
# Target vector is the "target" column; the feature matrix is every other column.
y = data["target"]
X = data.drop(columns=["target"])

Разбиение на обучающую и тестовую выборки

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report (rows, columns) of both feature matrices as a quick sanity check.
print("Размер train:", X_train.shape, sep=" ")
print("Размер test:", X_test.shape, sep=" ")

#### Pipeline

Библиотеки

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

Определяем типы признаков

In [None]:
# Split the columns by dtype family instead of by exact dtype names.
# ColumnTransformer drops every column that is not listed for a transformer,
# so with the previous ["int64", "float64"] / ["object"] filters any bool,
# category, int32/float32 or string-dtype column silently vanished from the
# model. Every non-numeric column is now treated as categorical, which
# guarantees each feature is routed to exactly one transformer.
num_features = X_train.select_dtypes(include="number").columns
cat_features = X_train.select_dtypes(exclude="number").columns

Предобработка

In [None]:
# Column-wise preprocessing: standardise the numeric features and one-hot
# encode the categorical ones. handle_unknown="ignore" keeps transform() from
# failing on categories that were not present when the encoder was fitted.
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
    ]
)

Базовая модель

In [None]:
# Random forest with library-default hyper-parameters; the seed is fixed for
# reproducible results.
rf = RandomForestClassifier(
    random_state=42,
)

Базовый Pipeline

In [None]:
# Chain preprocessing and the classifier so both are fitted together and the
# same transformations are applied at prediction time.
base_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", rf),
    ]
)

Обучение базовой модели

In [None]:
# Fit the preprocessing steps and the forest on the training split only.
base_pipeline.fit(X=X_train, y=y_train)

Оценка базового качества

In [None]:
# Score the fitted baseline on the held-out test split; for a classifier
# pipeline, score() returns mean accuracy.
score = base_pipeline.score(X=X_test, y=y_test)
print("Базовая точность:", score, sep=" ")

#### GridSearchCV

Библиотеки

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
import pandas as pd

Разделение на признаки и цель

In [None]:
# Rebuild the feature matrix and the target vector from the loaded frame:
# "target" is the label column, all remaining columns are features.
y = data["target"]
X = data.drop(columns=["target"])

Выделяем типы признаков

In [None]:
# Split the columns by dtype family instead of by exact dtype names.
# ColumnTransformer drops every column that is not listed for a transformer,
# so with the previous ["int64", "float64"] / ["object"] filters any bool,
# category, int32/float32 or string-dtype column silently vanished from the
# model. Every non-numeric column is now treated as categorical, which
# guarantees each feature is routed to exactly one transformer.
num_features = X.select_dtypes(include="number").columns
cat_features = X.select_dtypes(exclude="number").columns

Предобработка

In [None]:
# Preprocessing for the tuned pipeline: numeric columns are standardised,
# categorical columns are one-hot encoded. handle_unknown="ignore" prevents an
# error when a validation fold contains a category absent from the fit data.
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
    ]
)

Базовая модель

In [None]:
# Estimator whose hyper-parameters the grid search will vary; the seed is
# fixed so candidates differ only by the parameters being tuned.
rf = RandomForestClassifier(
    random_state=42,
)

Конвейер

In [None]:
# Pipeline to be tuned. The step name "model" is what the "model__" prefixes
# in the parameter grid refer to.
pipe = Pipeline(
    steps=[
        ("prep", preprocess),
        ("model", rf),
    ]
)

Сетка параметров

In [None]:
# Candidate hyper-parameters for the forest (2 * 3 * 2 = 12 combinations).
# The "model__" prefix addresses the pipeline step named "model".
param_grid = dict(
    model__n_estimators=[50, 100],
    model__max_depth=[5, 10, None],
    model__min_samples_split=[2, 5],
)

Подбор гиперпараметров: GridSearchCV с 3-кратной кросс-валидацией

In [None]:
# Tune on the training split only. Fitting the search on all of X, y lets the
# held-out test rows take part in model selection, so no untouched data is
# left for an honest final evaluation. The split is recreated here with the
# same test_size and seed as the baseline split, which keeps this section
# runnable on its own and yields identical partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Exhaustive search over param_grid with 3-fold cross-validation on the
# training rows; n_jobs=-1 uses all available CPU cores. After fitting,
# grid.score(X_test, y_test) gives a test accuracy that is directly
# comparable with the baseline pipeline's test score.
grid = GridSearchCV(pipe, param_grid, cv=3, n_jobs=-1)
grid.fit(X_train, y_train)

Результат

In [None]:
# Report the winning parameter combination and its score; best_score_ is the
# mean cross-validated score of that combination, not a test-set score.
print("Лучшие параметры:", grid.best_params_, sep=" ")
print("Лучшая точность:", grid.best_score_, sep=" ")