Необходимые импорты

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score
import seaborn as sns

Загрузка данных

In [2]:
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names
target_names = data.target_names

print(f"Размер датасета: {X.shape}")
print(f"Количество признаков: {len(feature_names)}")
print(f"Целевые классы: {target_names}")
print(f"Соотношение классов: {np.bincount(y)}")


Размер датасета: (569, 30)
Количество признаков: 30
Целевые классы: ['malignant' 'benign']
Соотношение классов: [212 357]


Разделение на тренировочную и тестовую выборку


In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTrain size: {X_train.shape}, Test size: {X_test.shape}")



Train size: (455, 30), Test size: (114, 30)


In [4]:
from knn import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier as KNeighborsClassifier_sklearn

модели

In [None]:
обучение

In [5]:
knn = KNeighborsClassifier(n_neighbors=5, weights='distance', p=2)
knn_sk = KNeighborsClassifier_sklearn(n_neighbors=5, weights='distance', p=2)
knn.fit(X_train, y_train)
knn_sk.fit(X_train, y_train)

предсказания

Используем реколл, так как не хотим пропустить больных при обследовании


In [6]:
y_pred = knn.predict(X_test)
y_pred_proba = knn.predict_proba(X_test)[:, 1]

y_pred_sk = knn_sk.predict(X_test)
y_pred_proba_sk = knn_sk.predict_proba(X_test)[:, 1]

In [7]:
print("Реколл для больных моей реализации", recall_score(y_test, y_pred, pos_label=0))
print("Реколл для больных склерн реализации", recall_score(y_test, y_pred_sk, pos_label=0))


Реколл для больных моей реализации 0.9047619047619048
Реколл для больных склерн реализации 0.9047619047619048


Нормализуем данные

In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

С помощью кросс валидации подбираем гиперпараметры


In [9]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier_sklearn())
])

param_grid = {
    'knn__n_neighbors': list(range(1, 31, 2)),
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2]
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='recall',
    n_jobs=-1,
    verbose=1
)

print("\nНачинаем поиск оптимальных параметров...")
grid_search.fit(X_train, y_train)

print(f"\nЛучшие параметры: {grid_search.best_params_}")
print(f"Лучшая точность (CV): {grid_search.best_score_:.4f}")



Начинаем поиск оптимальных параметров...
Fitting 5 folds for each of 60 candidates, totalling 300 fits

Лучшие параметры: {'knn__n_neighbors': 9, 'knn__p': 2, 'knn__weights': 'uniform'}
Лучшая точность (CV): 0.9895


Тестируем на лучшей модели

In [11]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

In [14]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

print("\n" + "="*50)
print("ОЦЕНКА МОДЕЛИ sklearn НА ТЕСТОВОЙ ВЫБОРКЕ")
print("="*50)

recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.4f}")


cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)


ОЦЕНКА МОДЕЛИ sklearn НА ТЕСТОВОЙ ВЫБОРКЕ
Recall: 1.0000
Confusion Matrix:
[[39  3]
 [ 0 72]]


Используем те же параметры на своей модели

In [15]:
knn = KNeighborsClassifier(n_neighbors=9, weights='uniform', p=2)

In [16]:
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [17]:
print("\n" + "="*50)
print("ОЦЕНКА МОДЕЛИ sklearn НА ТЕСТОВОЙ ВЫБОРКЕ")
print("="*50)

recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.4f}")


cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)


ОЦЕНКА МОДЕЛИ sklearn НА ТЕСТОВОЙ ВЫБОРКЕ
Recall: 1.0000
Confusion Matrix:
[[39  3]
 [ 0 72]]


Загружаем данные

In [20]:
from sklearn import datasets
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

Делим на выборки

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


Обучаем бейзлайн

In [24]:
from sklearn.neighbors import KNeighborsRegressor as KNeighborsRegressor_sk
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [26]:
knn_regressor = KNeighborsRegressor_sk(
    n_neighbors=5,
    weights='uniform',
    algorithm='auto',
    p=2,
    metric='minkowski'
)

knn_regressor.fit(X_train, y_train)

y_pred = knn_regressor.predict(X_test)

In [29]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"mse = {mse}")
print(f"mae = {mae}")
print(f"r2 = {r2}")

mse = 3019.075505617978
mae = 42.77078651685394
r2 = 0.43016439526042805


In [30]:
from knn import KNeighborsRegressor

In [33]:
knn_regressor = KNeighborsRegressor(
    n_neighbors=5,
    weights='uniform',
    p=2,
)

knn_regressor.fit(X_train, y_train)

y_pred = knn_regressor.predict(X_test)

In [34]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"mse = {mse}")
print(f"mae = {mae}")
print(f"r2 = {r2}")

mse = 3019.075505617978
mae = 42.77078651685394
r2 = 0.43016439526042805


In [None]:
Работаем над данными

In [35]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

In [None]:
Подбираем параметры

In [37]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor_sk())
])

param_grid = {
    'knn__n_neighbors': list(range(1, 31, 2)),
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2]
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

print("\nНачинаем поиск оптимальных параметров...")
grid_search.fit(X_train, y_train)

print(f"\nЛучшие параметры: {grid_search.best_params_}")
print(f"Лучшая точность (CV): {grid_search.best_score_:.4f}")



Начинаем поиск оптимальных параметров...
Fitting 5 folds for each of 60 candidates, totalling 300 fits

Лучшие параметры: {'knn__n_neighbors': 19, 'knn__p': 2, 'knn__weights': 'distance'}
Лучшая точность (CV): -3443.2888


Предсказываем на лучшей модели

In [39]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [40]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"mse = {mse}")
print(f"mae = {mae}")
print(f"r2 = {r2}")

mse = 2960.065382674736
mae = 44.73469527558608
r2 = 0.4413022647938486


Видим улучшение метрик


In [42]:
model = KNeighborsRegressor(n_neighbors=19, weights='distance',p=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [43]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"mse = {mse}")
print(f"mae = {mae}")
print(f"r2 = {r2}")

mse = 2960.065382999292
mae = 44.73469527917388
r2 = 0.44130226473259027


Аналогично