# 라이브러리 Import 및 데이터 Read

In [2]:
import pandas as pd
import os

In [3]:
# Switch the working directory to the data folder so that the relative
# file name used when reading the CSV resolves against it.
data_dir = r'C:\Users\user\Python_study\data'
os.chdir(data_dir)

In [4]:
# Load the analysis data set from the working directory and preview
# its first rows.
iris = pd.read_csv(filepath_or_buffer="IRIS.csv")
iris.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# Check the make-up of the target variable: number of rows per species.
species_counts = iris['species'].value_counts()
species_counts

Iris-virginica     50
Iris-versicolor    50
Iris-setosa        50
Name: species, dtype: int64

In [6]:
# Add a sequential id column (0, 1, 2, ...); it is used further down to
# tell which rows were sampled into the training set.
row_count = iris.shape[0]
iris['id'] = range(row_count)
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,id
0,5.1,3.5,1.4,0.2,Iris-setosa,0
1,4.9,3.0,1.4,0.2,Iris-setosa,1
2,4.7,3.2,1.3,0.2,Iris-setosa,2
3,4.6,3.1,1.5,0.2,Iris-setosa,3
4,5.0,3.6,1.4,0.2,Iris-setosa,4


In [7]:
# Reorder the columns so that id comes first and species comes last.
column_order = [
    'id',
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'species',
]
iris = iris[column_order]
iris.head()

Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,species
0,0,5.1,3.5,1.4,0.2,Iris-setosa
1,1,4.9,3.0,1.4,0.2,Iris-setosa
2,2,4.7,3.2,1.3,0.2,Iris-setosa
3,3,4.6,3.1,1.5,0.2,Iris-setosa
4,4,5.0,3.6,1.4,0.2,Iris-setosa


In [8]:
# Number of rows in the data set.
len(iris.index)

150

# KNN 실습(분류)

<strong>특징</strong>
- 데이터가 많으면 느리다.

<strong>파라미터</strong>
- n_neighbors: 예측할 때 참고할 가장 가까운 이웃의 개수(k)

<strong>knn 학습시키기</strong>

In [9]:
# Draw 100 rows of iris at random (without replacement, fixed seed) as the
# training set, and renumber the rows from 0 without keeping the old index.
train = iris.sample(n=100, replace=False, random_state=2020).reset_index(drop=True)
train.head()

Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,species
0,104,6.5,3.0,5.8,2.2,Iris-virginica
1,8,4.4,2.9,1.4,0.2,Iris-setosa
2,61,5.9,3.0,4.2,1.5,Iris-versicolor
3,54,6.5,2.8,4.6,1.5,Iris-versicolor
4,78,6.0,2.9,4.5,1.5,Iris-versicolor


In [10]:
# Every row whose id was not drawn into train becomes the test set;
# the rows are renumbered from 0 without keeping the old index.
held_out = ~iris['id'].isin(train['id'])
test = iris.loc[held_out].reset_index(drop=True)
test.head()

Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,species
0,1,4.9,3.0,1.4,0.2,Iris-setosa
1,3,4.6,3.1,1.5,0.2,Iris-setosa
2,6,4.6,3.4,1.4,0.3,Iris-setosa
3,9,4.9,3.1,1.5,0.1,Iris-setosa
4,20,5.4,3.4,1.7,0.2,Iris-setosa


In [11]:
# Estimator class used for KNN classification
from sklearn.neighbors import KNeighborsClassifier

# Define the model with n_neighbors set to 2
knn = KNeighborsClassifier(n_neighbors=2)

In [12]:
# Train the classifier on the train set: the four measurement columns are
# the features and species is the label.
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
knn.fit(train[feature_cols], train['species'])

KNeighborsClassifier(n_neighbors=2)

In [13]:
# Predict the species of every test row from its four measurement columns.
test_features = test[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
predictions = knn.predict(test_features)

In [14]:
# Store the predictions as a pred column of the test frame so they can be
# viewed next to the true species.
test = test.assign(pred=predictions)
test.head()

Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,species,pred
0,1,4.9,3.0,1.4,0.2,Iris-setosa,Iris-setosa
1,3,4.6,3.1,1.5,0.2,Iris-setosa,Iris-setosa
2,6,4.6,3.4,1.4,0.3,Iris-setosa,Iris-setosa
3,9,4.9,3.1,1.5,0.1,Iris-setosa,Iris-setosa
4,20,5.4,3.4,1.7,0.2,Iris-setosa,Iris-setosa


In [15]:
# Accuracy: the share of test rows whose predicted species equals the
# true species.
is_correct = test['species'] == predictions
is_correct.mean()

0.94

In [16]:
# Refit the classifier for k = 1..29 on the train set and report the
# test-set accuracy for each k.
# The original loop computed the predictions and then discarded them (its
# only reporting line was commented out), so it produced no result at all.
# As before, knn and predictions are left holding the k = 29 model/output.
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
for k in range(1, 30):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train[feature_cols], train['species'])
    predictions = knn.predict(test[feature_cols])
    accuracy = (pd.Series(predictions) == test['species']).mean()
    # Same "k :value" layout as the regression k sweep later in the notebook.
    print(str(k) + ' :' + str(accuracy))

In [17]:
from sklearn.model_selection import cross_val_score
import numpy as np

<strong>최적의 k 찾기 (Cross Validation 사용)</strong>

In [18]:
# For k = 1..29, run 5-fold cross-validation on the whole iris data set and
# print the mean of the fold scores, one line per k in increasing order.
cv_features = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
cv_labels = iris['species']
for k in range(1, 30):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, cv_features, cv_labels, cv=5)
    print(scores.mean())

0.96
0.9466666666666665
0.9666666666666668
0.9733333333333334
0.9733333333333334
0.9800000000000001
0.9800000000000001
0.9666666666666668
0.9733333333333334
0.9800000000000001
0.9800000000000001
0.9800000000000001
0.9733333333333334
0.9666666666666666
0.9666666666666668
0.9666666666666668
0.9666666666666668
0.9666666666666666
0.9666666666666668
0.96
0.9666666666666668
0.96
0.96
0.9466666666666667
0.96
0.9466666666666665
0.9466666666666667
0.9399999999999998
0.9333333333333332


#### 최적의 k는 6 (평균 점수 0.98로 최고이며, k=7, 10, 11, 12도 같은 점수로 동률)

# KNN 실습(회귀)

In [20]:
# Remove the species column from both frames in place; the regression
# that follows works only with the numeric measurement columns.
for frame in (train, test):
    del frame['species']

In [21]:
# Estimator class used for KNN regression
from sklearn.neighbors import KNeighborsRegressor

# Define the model with n_neighbors set to 2
knn = KNeighborsRegressor(n_neighbors=2)

In [22]:
# Train the regressor on the train set: petal_width is the target and the
# other three measurement columns are the inputs.
predictor_cols = ['sepal_length', 'sepal_width', 'petal_length']
knn.fit(train[predictor_cols], train['petal_width'])

KNeighborsRegressor(n_neighbors=2)

In [23]:
# Predict petal_width for every test row and display the resulting array.
test_inputs = test[['sepal_length', 'sepal_width', 'petal_length']]
predictions = knn.predict(test_inputs)
predictions

array([0.2 , 0.2 , 0.2 , 0.1 , 0.3 , 0.25, 0.25, 0.2 , 0.25, 0.15, 0.2 ,
       0.2 , 1.55, 1.4 , 1.25, 1.65, 1.35, 1.35, 1.25, 1.4 , 1.15, 1.4 ,
       1.65, 1.15, 1.15, 1.25, 1.9 , 1.3 , 1.65, 1.25, 1.  , 2.35, 2.3 ,
       2.  , 2.15, 2.05, 1.9 , 2.05, 2.4 , 2.4 , 1.8 , 2.1 , 2.35, 1.85,
       2.  , 2.05, 1.8 , 2.3 , 2.15, 2.  ])

In [24]:
# Replace the pred column of the test frame with the regression predictions
# so they can be viewed next to the true petal_width.
test = test.assign(pred=predictions)
test.head()

Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,pred
0,1,4.9,3.0,1.4,0.2,0.2
1,3,4.6,3.1,1.5,0.2,0.2
2,6,4.6,3.4,1.4,0.3,0.2
3,9,4.9,3.1,1.5,0.1,0.1
4,20,5.4,3.4,1.7,0.2,0.3


<strong>MAE</strong>

MAE(Mean Absolute Error, 평균 절대 오차)는 회귀 문제에서 모델의 성능을 평가하는 지표로, 실제값과 예측값의 차이에 절대값을 취해 평균한 값이다. 값이 작을수록 예측이 정확하다.

In [25]:
# Mean absolute error: average of |true petal_width - predicted petal_width|
# over the test rows.
absolute_errors = (test['petal_width'] - predictions).abs()
absolute_errors.mean()

0.16699999999999995

<strong>최적의 k 찾기</strong>

In [26]:
# Refit the regressor for k = 1..29 on the train set and print the test-set
# mean absolute error of each k as "k :value".
predictor_cols = ['sepal_length', 'sepal_width', 'petal_length']
train_inputs, train_target = train[predictor_cols], train['petal_width']
test_inputs, test_target = test[predictor_cols], test['petal_width']

for k in range(1, 30):
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(train_inputs, train_target)
    predictions = knn.predict(test_inputs)

    mae = (test_target - predictions).abs().mean()
    print(f"{k} :{mae!s}")

1 :0.196
2 :0.16699999999999995
3 :0.1673333333333333
4 :0.1655
5 :0.16839999999999994
6 :0.1619999999999999
7 :0.16342857142857137
8 :0.16125
9 :0.16155555555555556
10 :0.16260000000000002
11 :0.16254545454545455
12 :0.1635
13 :0.1636923076923077
14 :0.1695714285714286
15 :0.1738666666666667
16 :0.17587499999999995
17 :0.18282352941176455
18 :0.18955555555555553
19 :0.1913684210526316
20 :0.19329999999999994
21 :0.19780952380952382
22 :0.20045454545454547
23 :0.20330434782608697
24 :0.2045
25 :0.2084800000000001
26 :0.2123076923076923
27 :0.21599999999999997
28 :0.21878571428571422
29 :0.22365517241379315


최적의 k는 8 (MAE 0.16125로 가장 작음)