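# Day 8 assignment: KNN
# Plot how accuracy changes as k (the number of neighbors) varies from 3 to 30, and determine the optimal number of neighbors.
# Data: breast cancer data (31 variables, 569 records)
# - Target variable: diagnosis (0: Malignant, 212 records; 1: Benign, 357 records)
# NOTE(review): the cells below load HMEQ.csv (target column "BAD") and sweep k over 1..10 - confirm which data set and k range are intended.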

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Use the NanumBarunGothic font family for all figure text
# (presumably so Korean labels render - confirm the font is installed).
matplotlib.rc('font', family='NanumBarunGothic')
# Turn off the Unicode minus sign for axis tick labels.
plt.rcParams['axes.unicode_minus']=False

In [None]:
# Read the HMEQ CSV (path is relative to the working directory) with the
# pure-Python parser engine, then preview the first rows.
ds_hmeq = pd.read_csv(
    filepath_or_buffer='./Downloads/week4day1/HMEQ.csv',
    engine='python',
)
ds_hmeq.head()


In [None]:
# Print the DataFrame.info() summary of ds_hmeq.
ds_hmeq.info()

In [None]:
# Check missing values: count the NaN entries in every column.
ds_hmeq.isna().sum()

In [None]:
# Impute missing values with fillna (the function that fills NaN entries).
#
# JOB: a row with an empty JOB would otherwise drop out, so label it "Other".
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the ds_hmeq["JOB"] selection: that form is a chained
# in-place update, which pandas warns about and which does not modify ds_hmeq
# when Copy-on-Write is enabled.
ds_hmeq["JOB"] = ds_hmeq["JOB"].fillna("Other")

# Continuous columns: fill with the column mean.
# numeric_only=True keeps the mean computation to numeric columns; without it
# pandas >= 2.0 raises TypeError because ds_hmeq also holds string columns
# (JOB is one of them).
# NOTE(review): any other non-numeric column keeps its NaN values here -
# confirm that is intended before the one-hot encoding step.
ds_hmeq.fillna(ds_hmeq.mean(numeric_only=True), inplace=True)

In [None]:
# Convert ds_hmeq with pandas.get_dummies (dummy/indicator encoding of the
# categorical columns) and preview the first rows of the result.
ds_hmeq_dummy=pd.get_dummies(ds_hmeq)
ds_hmeq_dummy.head()

In [None]:
# Random train/test split: seed NumPy's global RNG so the split is repeatable,
# draw one uniform number per row, and send a row to the training set when its
# draw is below 0.7 (the remaining rows form the test set).
np.random.seed(1234)
msk = np.random.rand(len(ds_hmeq_dummy)) < 0.7
ds_hmeq_train = ds_hmeq_dummy.loc[msk]
ds_hmeq_test = ds_hmeq_dummy.loc[~msk]

# Separate the target column "BAD" from the explanatory columns.
# NOTE(review): the features are passed to KNN unscaled - confirm that
# distance computation on raw column scales is intended.
ds_hmeq_train_x = ds_hmeq_train.drop(columns="BAD")
ds_hmeq_train_y = ds_hmeq_train["BAD"]
ds_hmeq_test_x = ds_hmeq_test.drop(columns="BAD")
ds_hmeq_test_y = ds_hmeq_test["BAD"]

In [None]:
# Build a KNN classifier with every parameter left at its default; the bare
# name on the last line makes the notebook display the estimator.
knn_uncustomized=KNeighborsClassifier()
knn_uncustomized

In [None]:
# Fit the default KNN model on the training split and predict the test split.
knn_uncustomized.fit(ds_hmeq_train_x, ds_hmeq_train_y)
y_pred = knn_uncustomized.predict(ds_hmeq_test_x)

# Show the first ten predictions next to the first ten true labels, followed
# by the accuracy on the test split.
print("predict test set:{}".format(y_pred[:10].tolist()))
print("test set label:{}".format(ds_hmeq_test_y.iloc[:10].tolist()))
print(
    "accuracy:{0:.3f}".format(
        knn_uncustomized.score(ds_hmeq_test_x, ds_hmeq_test_y)
    )
)

In [None]:
# Sweep the number of neighbors k = 1..max_n_neighbors and record the
# accuracy on the training and test splits for each k.
max_n_neighbors = 10
para_n_neighbors = list(range(1, max_n_neighbors + 1))
train_accuracy, test_accuracy = [], []

for n_neighbors in para_n_neighbors:
    # fit() returns the fitted estimator, so construction and fitting chain.
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(
        ds_hmeq_train_x, ds_hmeq_train_y
    )
    train_accuracy.append(clf.score(ds_hmeq_train_x, ds_hmeq_train_y))
    test_accuracy.append(clf.score(ds_hmeq_test_x, ds_hmeq_test_y))

# One row per k; the bare name on the last line displays the table.
ds_neighbors = pd.DataFrame(
    {
        "Neighbors": para_n_neighbors,
        "TrainAccuracy": train_accuracy,
        "TestAccuracy": test_accuracy,
    }
)
ds_neighbors

In [None]:
# Line plot of train/test accuracy with the neighbor count on the x axis
# (set_index returns a new frame; ds_neighbors itself is left unchanged).
ds_neighbors.set_index("Neighbors").plot()

In [None]:
# Sweep k = 1..max_n_neighbors for each weighting scheme ("uniform" first,
# then "distance") and record train/test accuracy for every combination.
max_n_neighbors = 10
para_n_neighbors = list(range(1, max_n_neighbors + 1)) * 2
para_weights = [
    scheme
    for scheme in ("uniform", "distance")
    for _ in range(max_n_neighbors)
]
train_accuracy, test_accuracy = [], []

for (n_neighbors, weights) in zip(para_n_neighbors, para_weights):
    clf = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights).fit(
        ds_hmeq_train_x, ds_hmeq_train_y
    )
    train_accuracy.append(clf.score(ds_hmeq_train_x, ds_hmeq_train_y))
    test_accuracy.append(clf.score(ds_hmeq_test_x, ds_hmeq_test_y))

# One row per (k, weights) pair; the bare name displays the table.
ds_neighbors_weight = pd.DataFrame(
    {
        "Neighbors": para_n_neighbors,
        "Weights": para_weights,
        "TrainAccuracy": train_accuracy,
        "TestAccuracy": test_accuracy,
    }
)
ds_neighbors_weight

In [None]:
# Reshape to long form (one row per k / weights / accuracy type), build a
# combined "<weights>_<accuracy type>" label, and draw one line per label.
ds_neighbors_weight_melt = ds_neighbors_weight.melt(
    id_vars=["Neighbors", "Weights"]
).assign(Accuracy=lambda long_df: long_df["Weights"] + "_" + long_df["variable"])
ax = sns.lineplot(
    data=ds_neighbors_weight_melt,
    x="Neighbors",
    y="value",
    hue="Accuracy",
)

In [None]:
# Sweep k = 1..max_n_neighbors for each neighbor-search algorithm
# (ball_tree, kd_tree, brute) with uniform weights, recording train/test
# accuracy for every combination.
max_n_neighbors = 10
para_n_neighbors = list(range(1, max_n_neighbors + 1)) * 3
para_algorithm = [
    search
    for search in ("ball_tree", "kd_tree", "brute")
    for _ in range(max_n_neighbors)
]
train_accuracy, test_accuracy = [], []

for (n_neighbors, algorithm) in zip(para_n_neighbors, para_algorithm):
    clf = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights="uniform",
        algorithm=algorithm,
    ).fit(ds_hmeq_train_x, ds_hmeq_train_y)
    train_accuracy.append(clf.score(ds_hmeq_train_x, ds_hmeq_train_y))
    test_accuracy.append(clf.score(ds_hmeq_test_x, ds_hmeq_test_y))

# One row per (k, algorithm) pair; the bare name displays the table.
ds_neighbors_algorithm = pd.DataFrame(
    {
        "Neighbors": para_n_neighbors,
        "Algorithm": para_algorithm,
        "TrainAccuracy": train_accuracy,
        "TestAccuracy": test_accuracy,
    }
)
ds_neighbors_algorithm

In [None]:
# Reshape to long form (one row per k / algorithm / accuracy type), build a
# combined "<algorithm>_<accuracy type>" label, and draw one line per label.
ds_neighbors_algorithm_melt = ds_neighbors_algorithm.melt(
    id_vars=["Neighbors", "Algorithm"]
).assign(Accuracy=lambda long_df: long_df["Algorithm"] + "_" + long_df["variable"])
ax = sns.lineplot(
    data=ds_neighbors_algorithm_melt,
    x="Neighbors",
    y="value",
    hue="Accuracy",
)

In [None]:
# Sweep k = 1..max_n_neighbors for each distance metric (minkowski,
# euclidean, manhattan) with uniform weights and the ball_tree algorithm,
# recording train/test accuracy for every combination.
# NOTE(review): scikit-learn documents minkowski with its default p=2 as the
# standard Euclidean distance, so those two result sets are expected to
# coincide - confirm.
max_n_neighbors = 10
para_n_neighbors = list(range(1, max_n_neighbors + 1)) * 3
para_metric = [
    distance
    for distance in ("minkowski", "euclidean", "manhattan")
    for _ in range(max_n_neighbors)
]
train_accuracy, test_accuracy = [], []

for (n_neighbors, metric) in zip(para_n_neighbors, para_metric):
    clf = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights="uniform",
        algorithm="ball_tree",
        metric=metric,
    ).fit(ds_hmeq_train_x, ds_hmeq_train_y)
    train_accuracy.append(clf.score(ds_hmeq_train_x, ds_hmeq_train_y))
    test_accuracy.append(clf.score(ds_hmeq_test_x, ds_hmeq_test_y))

# One row per (k, metric) pair; the bare name displays the table.
ds_neighbors_metric = pd.DataFrame(
    {
        "Neighbors": para_n_neighbors,
        "Metric": para_metric,
        "TrainAccuracy": train_accuracy,
        "TestAccuracy": test_accuracy,
    }
)
ds_neighbors_metric

In [None]:
# Reshape to long form (one row per k / metric / accuracy type), build a
# combined "<metric>_<accuracy type>" label, and draw one line per label.
ds_neighbors_metric_melt = ds_neighbors_metric.melt(
    id_vars=["Neighbors", "Metric"]
).assign(Accuracy=lambda long_df: long_df["Metric"] + "_" + long_df["variable"])
ax = sns.lineplot(
    data=ds_neighbors_metric_melt,
    x="Neighbors",
    y="value",
    hue="Accuracy",
)

In [None]:
# Final model: k=4, uniform weights, ball_tree search, euclidean distance.
# Fit on the training split, then report accuracy on both splits.
clf_model = KNeighborsClassifier(
    n_neighbors=4,
    weights="uniform",
    algorithm="ball_tree",
    metric="euclidean",
)
clf_model.fit(ds_hmeq_train_x, ds_hmeq_train_y)
print(
    "train data accuracy:{0:.3f}".format(
        clf_model.score(ds_hmeq_train_x, ds_hmeq_train_y)
    )
)
print(
    "test data accuracy:{0:.3f}".format(
        clf_model.score(ds_hmeq_test_x, ds_hmeq_test_y)
    )
)