# Aplicando o KNN na Base de Dados Wine (amostras de vinho)

## – Importando as bibliotecas necessárias:

In [1]:
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# – Carregando a base de dados através do Sklearn:

In [2]:
wine = datasets.load_wine()

In [3]:
df_wine = pd.DataFrame(data=wine.data, columns=wine.feature_names)

In [4]:
df_wine.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [5]:
df_wine['class'] = wine.target

In [6]:
df_wine.head().T

Unnamed: 0,0,1,2,3,4
alcohol,14.23,13.2,13.16,14.37,13.24
malic_acid,1.71,1.78,2.36,1.95,2.59
ash,2.43,2.14,2.67,2.5,2.87
alcalinity_of_ash,15.6,11.2,18.6,16.8,21.0
magnesium,127.0,100.0,101.0,113.0,118.0
total_phenols,2.8,2.65,2.8,3.85,2.8
flavanoids,3.06,2.76,3.24,3.49,2.69
nonflavanoid_phenols,0.28,0.26,0.3,0.24,0.39
proanthocyanins,2.29,1.28,2.81,2.18,1.82
color_intensity,5.64,4.38,5.68,7.8,4.32


In [7]:
df_wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
 13  class

In [8]:
df_wine['class'].value_counts()

1    71
0    59
2    48
Name: class, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df_wine.drop('class', axis=1), df_wine['class'], test_size=0.3)

In [11]:
from sklearn.neighbors import KNeighborsClassifier

In [12]:
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

In [13]:
result = knn.predict(X_test)

In [14]:
print(pd.crosstab(y_test,result, rownames=['Real'], colnames=['Predito'], margins=True))

Predito   0   1  2  All
Real                   
0        17   1  0   18
1         3  14  5   22
2         5   5  4   14
All      25  20  9   54


In [15]:
from sklearn import metrics

In [16]:
print(metrics.classification_report(y_test, result, target_names=wine.target_names))

              precision    recall  f1-score   support

     class_0       0.68      0.94      0.79        18
     class_1       0.70      0.64      0.67        22
     class_2       0.44      0.29      0.35        14

    accuracy                           0.65        54
   macro avg       0.61      0.62      0.60        54
weighted avg       0.63      0.65      0.63        54



## Otimizando o parâmetro k utilizando o GridSearch:

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
# Definindo a lista de valores.
k_list = list(range(1, 31))

In [19]:
# Colocamos os valores em um dicionário
parametros = dict(n_neighbors=k_list)

In [20]:
grid = GridSearchCV(knn, parametros, cv=5, scoring='accuracy')

In [21]:
grid.fit(df_wine.drop('class',axis=1),df_wine['class'])

GridSearchCV(cv=5, estimator=KNeighborsClassifier(n_neighbors=3),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30]},
             scoring='accuracy')

In [22]:
print("Melhores parametros {} com o valor de acurácia {} ".format(grid.best_params_,grid.best_score_))

Melhores parametros {'n_neighbors': 1} com o valor de acurácia 0.7250793650793651 


Referencias :
https://minerandodados.com.br/machine-learning-na-pratica-knn-python/