Необходимые импорты

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score
import seaborn as sns

Загрузка данных

In [2]:
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names
target_names = data.target_names

print(f"Размер датасета: {X.shape}")
print(f"Количество признаков: {len(feature_names)}")
print(f"Целевые классы: {target_names}")
print(f"Соотношение классов: {np.bincount(y)}")


Размер датасета: (569, 30)
Количество признаков: 30
Целевые классы: ['malignant' 'benign']
Соотношение классов: [212 357]


Разделение на тренировочную и тестовую выборку


In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTrain size: {X_train.shape}, Test size: {X_test.shape}")



Train size: (455, 30), Test size: (114, 30)


In [4]:
from random_forest_classifier import RandomForestClassifier as rf
from sklearn.ensemble import RandomForestClassifier as rfsk

модели

Обучение

In [5]:
lr = rf(n_estimators=100, max_depth=5)
lrsk = rfsk()
lr.fit(X_train, y_train)
lrsk.fit(X_train, y_train)

предсказания

Используем реколл, так как не хотим пропустить больных при обследовании


In [6]:
y_pred = lr.predict(X_test)
y_pred_proba = lr.predict_proba(X_test)

y_pred_sk = lrsk.predict(X_test)
y_pred_proba_sk = lrsk.predict_proba(X_test)

In [7]:
print("Реколл для больных моей реализации", recall_score(y_test, y_pred, pos_label=0))
print("Реколл для больных склерн реализации", recall_score(y_test, y_pred_sk, pos_label=0))


Реколл для больных моей реализации 0.9285714285714286
Реколл для больных склерн реализации 0.9285714285714286


Нормализуем данные

In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

С помощью кросс валидации подбираем гиперпараметры


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Создание пайплайна для случайного леса
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('forest', RandomForestClassifier(random_state=42, n_jobs=-1))
])

param_grid = {
    'forest__n_estimators': [100, 200],
    'forest__max_depth': [15, 30, None],
    'forest__min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='recall',
    n_jobs=-1,
    verbose=1
)

print("\nПоиск параметров для леса...")
grid_search.fit(X_train, y_train)

print(f"\nЛучшие параметры: {grid_search.best_params_}")
print(f"Лучший recall (CV): {grid_search.best_score_:.4f}")

best_forest_model = grid_search.best_estimator_

from sklearn.metrics import recall_score, classification_report

y_pred = best_forest_model.predict(X_test)
test_recall = recall_score(y_test, y_pred)
print(f"\nRecall на тесте: {test_recall:.4f}")
print("\nОтчет по классификации:")
print(classification_report(y_test, y_pred))


Поиск параметров для леса...
Fitting 5 folds for each of 12 candidates, totalling 60 fits

Лучшие параметры: {'forest__max_depth': 15, 'forest__min_samples_split': 2, 'forest__n_estimators': 100}
Лучший recall (CV): 0.9684

Recall на тесте: 0.9722

Отчет по классификации:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94        42
           1       0.96      0.97      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



Тестируем на лучшей модели

In [11]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

print("\n" + "="*50)
print("ОЦЕНКА МОДЕЛИ sklearn НА ТЕСТОВОЙ ВЫБОРКЕ")
print("="*50)

recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.4f}")


cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)


ОЦЕНКА МОДЕЛИ sklearn НА ТЕСТОВОЙ ВЫБОРКЕ
Recall: 0.9722
Confusion Matrix:
[[39  3]
 [ 2 70]]


Используем те же параметры на своей модели

In [12]:
lr = rf(max_depth=15)

In [13]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [14]:
print("\n" + "="*50)
print("ОЦЕНКА МОДЕЛИ sklearn НА ТЕСТОВОЙ ВЫБОРКЕ")
print("="*50)

recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.4f}")


cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)


ОЦЕНКА МОДЕЛИ sklearn НА ТЕСТОВОЙ ВЫБОРКЕ
Recall: 0.9583
Confusion Matrix:
[[39  3]
 [ 3 69]]


Загружаем данные

In [1]:
from sklearn import datasets
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

Делим на выборки

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


Обучаем бейзлайн

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [6]:
dt = RandomForestRegressor()

dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

In [7]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"mse = {mse}")
print(f"mae = {mae}")
print(f"r2 = {r2}")

mse = 3057.6674528089893
mae = 44.95606741573034
r2 = 0.42288035565136695


In [8]:
from random_forest_regressor import RandomForestRegressor as rfr

In [9]:
dt = rfr()

dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

In [10]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"mse = {mse}")
print(f"mae = {mae}")
print(f"r2 = {r2}")

mse = 2953.7699393258426
mae = 44.29033707865169
r2 = 0.44249049866250967


In [14]:
Работаем над данными

SyntaxError: invalid syntax (ipython-input-460497718.py, line 1)

In [11]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

In [None]:
Подбираем параметры

In [13]:
from sklearn.ensemble import RandomForestRegressor

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('forest', RandomForestRegressor(random_state=42, n_jobs=-1))
])

param_grid = {
    'forest__n_estimators': [100, 200],
    'forest__max_depth': [10, 20, None],
    'forest__min_samples_leaf': [1, 5]
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

print("\nПоиск параметров для случайного леса...")
grid_search.fit(X_train, y_train)

print(f"\nЛучшие параметры: {grid_search.best_params_}")
print(f"Лучший отрицательный MSE (CV): {grid_search.best_score_:.4f}")
print(f"Лучший MSE (CV): {-grid_search.best_score_:.4f}")

best_model = grid_search.best_estimator_


Поиск параметров для случайного леса...
Fitting 3 folds for each of 12 candidates, totalling 36 fits

Лучшие параметры: {'forest__max_depth': 10, 'forest__min_samples_leaf': 5, 'forest__n_estimators': 200}
Лучший отрицательный MSE (CV): -3449.9556
Лучший MSE (CV): 3449.9556


Предсказываем на лучшей модели

In [14]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [15]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"mse = {mse}")
print(f"mae = {mae}")
print(f"r2 = {r2}")

mse = 2874.9676899867145
mae = 43.41784168440716
r2 = 0.4573640343933788


Видим улучшение метрик


In [16]:
model = rfr(max_depth=10, min_samples_leaf=5, n_estimators=200)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [17]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"mse = {mse}")
print(f"mae = {mae}")
print(f"r2 = {r2}")

mse = 2784.9392384190633
mae = 42.981549123176706
r2 = 0.47435646040172375


Аналогично