<a href="https://colab.research.google.com/github/KilHwanKim/red-wine-prediction-using-KNN/blob/main/wine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### KNN 알고리즘을 이용하여 레드와인의 품질을 예측하려고 한다.

**목차**  
**1. 데이터 불러오기**

In [1]:
# Mount Google Drive into the Colab runtime at /gdrive.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [2]:
import numpy as np## 선형데이터 표현
import pandas as pd## 데이터 전처리
import matplotlib.pyplot as plt
from itertools import combinations
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

## 1. 데이터 불러오기

In [3]:
# Load the red-wine quality CSV from the mounted Drive folder into a DataFrame.
wine = pd.read_csv("/gdrive/My Drive/머신러닝/wine/winequality-red.csv")

In [4]:
# Preview the first rows of the data set.
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
len(wine)  # number of rows in the data set (1599)

1599

In [6]:
# Split the frame into the feature columns and the prediction target.
# The target is selected by name so the split does not depend on
# "quality" being the last column of the CSV.
target_column = "quality"
features = [column for column in wine.columns if column != target_column]
y = wine[target_column]  # quality label of every wine
x = wine[features]  # every remaining column is used as a feature
y

0       5
1       5
2       5
3       6
4       5
       ..
1594    5
1595    6
1596    6
1597    5
1598    6
Name: quality, Length: 1599, dtype: int64

## 2. 데이터 전처리
### a. null 값 존재 시 제거

In [7]:
# Count the missing values in each column.
wine.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

null 값은 존재하지 않는다.


### b. 표준화


> 다음과 같이 특성별로 평균과 분산이 모두 다르다.





In [8]:
wine[["total sulfur dioxide","volatile acidity"]].var()  # variance of the two selected features

total sulfur dioxide    1082.102373
volatile acidity           0.032062
dtype: float64

In [9]:
wine[["total sulfur dioxide","volatile acidity"]].mean()  # mean of the two selected features

total sulfur dioxide    46.467792
volatile acidity         0.527821
dtype: float64

즉 두 특성의 1 값은 가치가 다르다.  
("total sulfur dioxide"에서 1은 작은 값이지만 "volatile acidity"에선 매우 큰 값이다.)  
이러한 점은 데이터 예측에 악영향을 주므로 표준화를 시켜준다.

In [10]:
# Standardize every feature column (StandardScaler removes the mean and
# scales to unit variance) so that no feature dominates the distance.
# NOTE(review): the scaler is fitted on all rows before the train/test
# split that follows, so test rows influence the scaling statistics.
scaler = StandardScaler()
x = scaler.fit_transform(x)

## 3. 데이터 모델링
### a. 모든 특성 사용

In [11]:
# Hold out 10% of the rows as the test set and train on the remaining 90%.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.10,
    random_state=3,
    shuffle=True,
)
# 1-nearest-neighbour classifier; p=1 selects the Manhattan distance.
# fit() returns the estimator itself, so the fitted model is bound directly.
Knn = KNeighborsClassifier(p=1, n_neighbors=1).fit(train_x, train_y)
Knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=1,
                     weights='uniform')

In [12]:
# Predict the quality label of every held-out row.
y_pred=Knn.predict(test_x)
y_pred

array([6, 6, 6, 8, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 7, 5, 6, 5, 7, 7, 7, 5,
       7, 6, 5, 5, 3, 6, 5, 6, 5, 6, 5, 6, 5, 5, 6, 5, 5, 6, 6, 5, 7, 6,
       5, 5, 6, 5, 6, 6, 4, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 6, 6,
       5, 6, 5, 6, 4, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 4, 5, 7, 5, 5, 5, 5,
       5, 5, 6, 6, 5, 5, 5, 6, 5, 5, 6, 7, 5, 5, 7, 8, 6, 5, 5, 5, 6, 7,
       7, 7, 6, 6, 5, 5, 6, 5, 6, 6, 6, 5, 6, 5, 6, 6, 7, 6, 5, 5, 5, 6,
       6, 5, 7, 5, 7, 6, 5, 6, 6, 7, 5, 5, 6, 5, 6, 6, 7, 5, 5, 6, 6, 5,
       5, 6, 5, 6, 5, 6])

In [13]:
# Show the true labels of the held-out rows for comparison with y_pred.
test_y

147     5
937     4
877     6
1000    7
73      4
       ..
816     6
1282    6
1013    6
153     5
876     4
Name: quality, Length: 160, dtype: int64

In [14]:
# Report the test-set accuracy as a percentage.
print('정확도 : ',100*accuracy_score(test_y,y_pred))

정확도 :  67.5


모든 특성을 사용하는 것은 오히려 방해가 될 수도 있기 때문에, 특성을 선택하여 정확도를 비교한다.

### b. 데이터 분산 순으로 선택

In [15]:
# Rank the features by the variance of their raw (unstandardized) values,
# largest first. Each entry stays a [feature name, variance] pair because
# the following cell indexes into var_list.
var_list = sorted(
    ([feat, wine[feat].var()] for feat in features),
    key=lambda entry: entry[1],
    reverse=True,
)
# Build the table in one call (rows labelled from 1) instead of enlarging
# an empty frame one row at a time.
var_df = pd.DataFrame(
    var_list,
    columns=["특성", "분산"],
    index=range(1, len(var_list) + 1),
)
display(var_df)  # variance of each feature

Unnamed: 0,특성,분산
1,total sulfur dioxide,1082.102373
2,free sulfur dioxide,109.414884
3,fixed acidity,3.031416
4,residual sugar,1.987897
5,alcohol,1.135647
6,citric acid,0.037947
7,volatile acidity,0.032062
8,sulphates,0.028733
9,pH,0.023835
10,chlorides,0.002215


분산이 큰 순서대로 특성을 추가하면서 정확도를 비교하도록 하자.

In [16]:
# Grow the feature set in order of decreasing variance and record the test
# accuracy (in percent) of a 1-NN classifier for every prefix of that order.
# NOTE(review): the raw columns of `wine` are used here, not standardized
# values -- confirm this is intended.
variance_ranked = [entry[0] for entry in var_list]
variance_scores = {}
for ind in range(2, len(var_list) + 1):
    x = wine[variance_ranked[:ind]]
    train_x, test_x, train_y, test_y = train_test_split(
        x, y, test_size=0.10, random_state=3, shuffle=True
    )
    Knn = KNeighborsClassifier(n_neighbors=1, p=1)
    Knn.fit(train_x, train_y)
    y_pred = Knn.predict(test_x)
    variance_scores[ind] = 100 * accuracy_score(test_y, y_pred)
# Rows are labelled by the number of features used.
var_ac = pd.DataFrame({"정확도": pd.Series(variance_scores, dtype="float64")})
display(var_ac)

Unnamed: 0,정확도
2,54.375
3,60.625
4,61.25
5,60.0
6,59.375
7,63.125
8,62.5
9,62.5
10,61.875
11,61.875


특성 7개를 사용했을 때 가장 높은 정확도 63.125가 나왔다.  
('total sulfur dioxide',
 'free sulfur dioxide',
 'fixed acidity',
 'residual sugar',
 'alcohol',
 'citric acid',
 'volatile acidity')





### c. 각각의 특성을 따로 계산


> 특성 하나만으로 각각 정확도를 구하고, 그 정확도가 높은 순으로 특성을 추가하며 정확도를 비교한다.



In [17]:
# Score every feature on its own with a 1-NN classifier and rank the
# features by the resulting test accuracy (in percent), best first.
# The classifier input keeps the constant zero "test" column next to each
# feature; it lives on a copy so the shared `wine` frame is never modified.
# NOTE(review): wine[[feat]] is already two-dimensional, so the constant
# column looks unnecessary -- verify the scores are unchanged before dropping it.
padded = wine.assign(test=0)
one = []
for feat in features:
    x = padded[[feat, "test"]]
    train_x, test_x, train_y, test_y = train_test_split(
        x, y, test_size=0.10, random_state=3, shuffle=True
    )
    Knn = KNeighborsClassifier(n_neighbors=1, p=1)
    Knn.fit(train_x, train_y)
    y_pred = Knn.predict(test_x)
    one.append([feat, 100 * accuracy_score(test_y, y_pred)])
# Stable descending sort: features with equal accuracy keep their original order.
one.sort(key=lambda entry: entry[-1], reverse=True)
one_ac = pd.DataFrame(one, columns=["특성", "정확도"])
display(one_ac)

Unnamed: 0,특성,정확도
0,alcohol,54.375
1,citric acid,46.875
2,density,46.875
3,volatile acidity,45.0
4,total sulfur dioxide,44.375
5,sulphates,41.875
6,fixed acidity,41.25
7,chlorides,40.0
8,pH,38.75
9,free sulfur dioxide,36.25


정확도가 높은 순서대로 특성을 추가하여 정확도를 구한다.

In [18]:
# Grow the feature set in order of decreasing single-feature accuracy and
# record the test accuracy (in percent) of a 1-NN classifier for each prefix.
# The scores go into their own frame (df) so that the variance-ordered
# results held in var_ac are not overwritten.
# NOTE(review): the raw columns of `wine` are used here, not standardized
# values -- confirm this is intended.
accuracy_ranked = [entry[0] for entry in one]
ranked_scores = {}
for ind in range(2, len(one) + 1):
    x = wine[accuracy_ranked[:ind]]
    train_x, test_x, train_y, test_y = train_test_split(
        x, y, test_size=0.10, random_state=3, shuffle=True
    )
    Knn = KNeighborsClassifier(n_neighbors=1, p=1)
    Knn.fit(train_x, train_y)
    y_pred = Knn.predict(test_x)
    ranked_scores[ind] = 100 * accuracy_score(test_y, y_pred)
# Rows are labelled by the number of features used.
df = pd.DataFrame({"정확도": pd.Series(ranked_scores, dtype="float64")})
display(df)

Unnamed: 0,정확도
2,54.375
3,62.5
4,60.625
5,65.0
6,65.0
7,62.5
8,63.125
9,64.375
10,63.125
11,61.875


특성 5개를 사용했을 때 가장 높은 정확도 65.000이 나왔다(특성 6개를 사용했을 때도 동일).  
('alcohol',
 'citric acid',
 'density',
 'volatile acidity',
 'total sulfur dioxide')

### d. 가장 좋은 방법: 모든 경우의 수

> 가장 좋은 방법은 조합될 수 있는 모든 경우의 수를 따져 보는 것이다.  
 (k 값을 변경하며 진행)



In [19]:
# Exhaustive search: evaluate every non-empty feature subset with every
# k from 1 to 19 and collect [k, feature tuple, test accuracy in percent].
# NOTE(review): the scaler is fitted on all rows before the split, and the
# results are ranked on the same held-out split they are reported on.
answer = []
for number in range(1, len(features) + 1):
    # combinations() is consumed lazily; no intermediate list is needed.
    for val in combinations(features, number):
        x = StandardScaler().fit_transform(wine[list(val)])
        train_x, test_x, train_y, test_y = train_test_split(
            x, y, test_size=0.10, random_state=3, shuffle=True
        )
        for k in range(1, 20):
            Knn = KNeighborsClassifier(n_neighbors=k, p=1)
            Knn.fit(train_x, train_y)
            y_pred = Knn.predict(test_x)
            answer.append([k, val, 100 * accuracy_score(test_y, y_pred)])
# Best accuracy first; the stable sort keeps ties in evaluation order.
answer.sort(key=lambda result: result[-1], reverse=True)

In [20]:
# Show the five best (feature subset, k) configurations found by the search.
# None removes the column-width limit so the feature tuples are not
# truncated; a negative value is deprecated for this option.
pd.set_option("display.max_colwidth", None)
# Slicing tolerates fewer than five results.
top_results = answer[:5]
# Building the columns separately keeps each feature tuple as a single cell
# and keeps K as an integer, avoiding a ragged mixed-type row assignment.
df = pd.DataFrame(
    {
        "특성": [result[1] for result in top_results],
        "K": [result[0] for result in top_results],
        "정확도": [result[-1] for result in top_results],
    }
)
display(df)

  
  return array(a, dtype, copy=False, order=order)


Unnamed: 0,특성,K,정확도
0,"(fixed acidity, volatile acidity, residual sugar, total sulfur dioxide, density, pH)",1.0,73.125
1,"(volatile acidity, residual sugar, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates)",5.0,73.125
2,"(fixed acidity, citric acid, residual sugar, total sulfur dioxide, alcohol)",1.0,71.875
3,"(volatile acidity, residual sugar, free sulfur dioxide, pH, sulphates, alcohol)",1.0,71.875
4,"(fixed acidity, volatile acidity, citric acid, residual sugar, free sulfur dioxide, density, sulphates)",1.0,71.875


6개의 특성을 K=1로 KNN을 진행했을 때 73.125%라는 가장 높은 수치가 나왔다.  
(fixed acidity, volatile acidity, residual sugar, total sulfur dioxide, density, pH)  


>그러나 시간이 많이 소모되었다. 특성이나 데이터 개수가 더 많은 모델에서는 사용하지 못할 것으로 생각된다.

