In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Render matplotlib figures inline, directly below the cell that produces them
%matplotlib inline
# Apply seaborn's theme defaults using the "ticks" axes style for all later plots
sns.set(style="ticks")

In [3]:
from sklearn import datasets

# Load the scikit-learn wine dataset and wrap its feature matrix in a DataFrame
# whose columns carry the feature names. Only the features are placed in the
# frame; the class labels stored in the dataset are not added.
wine = datasets.load_wine()
wine_df = pd.DataFrame(data=wine["data"], columns=wine["feature_names"])

# Preview the first five rows
wine_df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [4]:
# Count missing values per column; all zeros means no imputation is needed
wine_df.isnull().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
dtype: int64

In [6]:
# Split the frame into the feature matrix and the regression target.
# NOTE: the target is the `proline` column (the last column of wine_df), i.e. a
# regular feature of the dataset -- the dataset's class labels are not part of
# wine_df, so this is a regression of proline on the remaining 12 features.
target_name = "proline"
features = wine_df.drop(columns=[target_name])
X = features.values
y = wine_df[target_name].values

In [8]:
# Build the training and test sets: 20% of the rows are held out for testing,
# and the fixed random_state keeps the split reproducible between runs
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [11]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# Baseline model: k-nearest-neighbours regressor with a fixed K=15, fitted on
# the training split and evaluated on the held-out test split.
# NOTE(review): the features are used unscaled, so the Euclidean distance is
# dominated by large-magnitude columns (e.g. magnesium ~100 vs hue ~1).
# Consider wrapping the model in a Pipeline with StandardScaler.
knn = KNeighborsRegressor(n_neighbors=15)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# MAE - mean absolute error
mae = mean_absolute_error(y_test, y_pred)
# RMSE - root mean squared error (penalises large errors more heavily than MAE).
# Computed as sqrt(MSE) rather than with `squared=False`: that argument is
# deprecated in scikit-learn 1.4 and removed in 1.6, while sqrt(MSE) yields the
# same value on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# MAPE - mean absolute percentage error. scikit-learn returns it as a fraction
# (0.37 means 37%); it is not multiplied by 100.
mape = mean_absolute_percentage_error(y_test, y_pred)

print("MAE:", mae)
print("RMSE:", rmse)
print("MAPE:", mape)

MAE: 231.86111111111111
RMSE: 281.1189043715519
MAPE: 0.3737341418083258


In [13]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold, ShuffleSplit


def _test_rmse_mape(model, X_eval, y_eval):
    """Evaluate an already fitted regressor on the given data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_eval : feature matrix to predict on.
    y_eval : true target values matching ``X_eval``.

    Returns
    -------
    tuple
        ``(predictions, rmse, mape)`` where ``rmse`` is ``sqrt(MSE)`` and
        ``mape`` is the fraction returned by ``mean_absolute_percentage_error``.
    """
    predictions = model.predict(X_eval)
    rmse = np.sqrt(mean_squared_error(y_eval, predictions))
    mape = mean_absolute_percentage_error(y_eval, predictions)
    return predictions, rmse, mape


knn = KNeighborsRegressor()

# Candidate numbers of neighbours: K = 1..30
param_grid = {'n_neighbors': np.arange(1, 31)}

# Two cross-validation strategies, both seeded for reproducibility:
#   [0] 5-fold KFold with shuffling, [1] five random 80/20 ShuffleSplit splits
cv_strategies = [KFold(n_splits=5, shuffle=True, random_state=42),
                 ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)]

# GridSearchCV: exhaustive search over every K in the grid, scored by MAE.
# The scorer is "neg_mean_absolute_error", so best_score_ is negated back to a
# positive MAE. This is a cross-validation score on the training split, not a
# test-set score.
grid_search = GridSearchCV(knn, param_grid, scoring='neg_mean_absolute_error', cv=cv_strategies[0])
grid_search.fit(X_train, y_train)

print("GridSearchCV Best MAE:", -grid_search.best_score_)
print("GridSearchCV Best Params:", grid_search.best_params_)

# RandomizedSearchCV: evaluates only 10 randomly sampled values of K from the
# same grid, using the ShuffleSplit strategy
random_search = RandomizedSearchCV(knn, param_grid, scoring='neg_mean_absolute_error', cv=cv_strategies[1], n_iter=10, random_state=42)
random_search.fit(X_train, y_train)

print("RandomizedSearchCV Best MAE:", -random_search.best_score_)
print("RandomizedSearchCV Best Params:", random_search.best_params_)

# Test-set evaluation of the model selected by GridSearchCV.
# With the default refit=True the search has already refitted best_estimator_
# on the whole training split, so no additional .fit() call is needed.
best_model_gs = grid_search.best_estimator_
y_pred_gs, rmse_gs, mape_gs = _test_rmse_mape(best_model_gs, X_test, y_test)

print("GridSearchCV Best RMSE:", rmse_gs)
print("GridSearchCV Best MAPE:", mape_gs)

# Test-set evaluation of the model selected by RandomizedSearchCV
# (likewise already refitted on the whole training split)
best_model_rs = random_search.best_estimator_
y_pred_rs, rmse_rs, mape_rs = _test_rmse_mape(best_model_rs, X_test, y_test)

print("RandomizedSearchCV Best RMSE:", rmse_rs)
print("RandomizedSearchCV Best MAPE:", mape_rs)

GridSearchCV Best MAE: 171.22814039408868
GridSearchCV Best Params: {'n_neighbors': 8}
RandomizedSearchCV Best MAE: 171.7501915708812
RandomizedSearchCV Best Params: {'n_neighbors': 9}
GridSearchCV Best RMSE: 262.12011284719796
GridSearchCV Best MAPE: 0.3556902543653036
RandomizedSearchCV Best RMSE: 252.31200542098873
RandomizedSearchCV Best MAPE: 0.3453719388489157
