В этом задании следует найти оптимальное значение k для алгоритма k ближайших соседей. Решим следующую задачу классификации: необходимо предсказать сорт винограда, из которого сделано вино, зная его химические характеристики.

Загрузим выборку wine.csv и дадим названия стобцам признаков

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale

In [2]:
data = pd.read_csv(filepath_or_buffer='wine.csv')
data.columns = [
				"Label of class",
				"Alcohol",
				"Malic acid",
				"Ash",
				"Alcalinity of ash",
				"Magnesium",
				"Total phenols",
				"Flavanoids",
				"Nonflavanoid phenols",
				"Proanthocyanins",
				"Color intensity",
				"Hue",
				"0D280/0D315 of diluted wines",
				"Proline"
			   ]

data

Unnamed: 0,Label of class,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,0D280/0D315 of diluted wines,Proline
0,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
1,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
2,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
3,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
4,1,14.20,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
173,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
174,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
175,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


Разделим выборку на столбцы признаков и столбец ответов. Ответы записаны в 1-ом столбце, признаки - в стобцах со 2-ого до последнего.

In [3]:
data_y = data[data.columns[0]]
data_X = data[data.columns[1:]]
data_X, data_y

(     Alcohol  Malic acid   Ash  Alcalinity of ash  Magnesium  Total phenols  \
 0      13.20        1.78  2.14               11.2        100           2.65   
 1      13.16        2.36  2.67               18.6        101           2.80   
 2      14.37        1.95  2.50               16.8        113           3.85   
 3      13.24        2.59  2.87               21.0        118           2.80   
 4      14.20        1.76  2.45               15.2        112           3.27   
 ..       ...         ...   ...                ...        ...            ...   
 172    13.71        5.65  2.45               20.5         95           1.68   
 173    13.40        3.91  2.48               23.0        102           1.80   
 174    13.27        4.28  2.26               20.0        120           1.59   
 175    13.17        2.59  2.37               20.0        120           1.65   
 176    14.13        4.10  2.74               24.5         96           2.05   
 
      Flavanoids  Nonflavanoid phenols

Используем метод K-ближайших соседей

In [4]:
k_blocks = 10
scores = np.empty((100, k_blocks))
for k in range(1, 101):
    clf = KNeighborsClassifier(n_neighbors=k)
    scores[k-1] = cross_val_score(clf, data_X, data_y, cv=k_blocks)

Теперь найдем среднее арифметическое от оценок качества для каждого k

In [5]:
sumed_scores = np.sum(scores, axis=1) / k_blocks
max_score = sumed_scores.max()
ind_of_max_score, = np.where(sumed_scores == max_score)
sumed_scores, max_score, ind_of_max_score + 1

(array([0.75228758, 0.67352941, 0.71372549, 0.6620915 , 0.68496732,
        0.66732026, 0.66764706, 0.69509804, 0.69019608, 0.70065359,
        0.70065359, 0.68954248, 0.70065359, 0.68954248, 0.71830065,
        0.70718954, 0.70098039, 0.7127451 , 0.70686275, 0.68431373,
        0.72908497, 0.70098039, 0.71764706, 0.70098039, 0.7127451 ,
        0.70098039, 0.71830065, 0.71830065, 0.71830065, 0.72385621,
        0.7127451 , 0.70718954, 0.70718954, 0.7130719 , 0.70718954,
        0.70130719, 0.70718954, 0.70163399, 0.70130719, 0.7130719 ,
        0.69575163, 0.69575163, 0.70163399, 0.69607843, 0.69607843,
        0.69019608, 0.69607843, 0.69607843, 0.69607843, 0.70784314,
        0.70163399, 0.70163399, 0.70163399, 0.70751634, 0.70196078,
        0.71339869, 0.71339869, 0.70751634, 0.69607843, 0.70751634,
        0.70751634, 0.70751634, 0.69640523, 0.69640523, 0.69084967,
        0.69640523, 0.69640523, 0.69640523, 0.70228758, 0.69673203,
        0.69673203, 0.69117647, 0.69117647, 0.70

Промасштабируем признаки

In [6]:
data_scaled_X = scale(data_X)
scores = np.empty((100, k_blocks))
for k in range(1, 101):
    clf = KNeighborsClassifier(n_neighbors=k)
    scores[k-1] = cross_val_score(clf, data_scaled_X, data_y, cv=k_blocks)

In [7]:
sumed_scores = np.sum(scores, axis=1) / k_blocks
max_score = sumed_scores.max()
ind_of_max_score, = np.where(sumed_scores == max_score)
sumed_scores, max_score, ind_of_max_score + 1

(array([0.94346405, 0.9379085 , 0.95457516, 0.94901961, 0.96633987,
        0.95490196, 0.96078431, 0.96633987, 0.97745098, 0.97189542,
        0.97156863, 0.96078431, 0.96633987, 0.96666667, 0.96633987,
        0.96078431, 0.97222222, 0.96633987, 0.97222222, 0.97222222,
        0.97222222, 0.97777778, 0.98333333, 0.98333333, 0.97189542,
        0.96045752, 0.96601307, 0.96601307, 0.96601307, 0.96601307,
        0.96601307, 0.97189542, 0.97189542, 0.97745098, 0.97189542,
        0.96633987, 0.96633987, 0.96633987, 0.96045752, 0.96045752,
        0.96633987, 0.96633987, 0.96633987, 0.96045752, 0.96045752,
        0.96045752, 0.96045752, 0.96045752, 0.96045752, 0.96633987,
        0.96045752, 0.96045752, 0.96045752, 0.95490196, 0.96045752,
        0.96045752, 0.96045752, 0.96633987, 0.96633987, 0.96078431,
        0.96078431, 0.96078431, 0.95522876, 0.95522876, 0.95522876,
        0.94411765, 0.95522876, 0.95522876, 0.96666667, 0.95522876,
        0.96666667, 0.96111111, 0.96666667, 0.96

In [9]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


clf = KNeighborsClassifier(n_neighbors=23)
X_train, X_test, y_train, y_test = train_test_split(data_scaled_X, data_y, test_size=0.2, random_state=200)
clf.fit(X_train, y_train)
y_new = clf.predict(X_test)
accuracy_score(y_test, y_new)

0.9722222222222222