## Iris鸢尾花分类
KNN中，用10折交叉验证寻找最佳的K

In [1]:
import pandas as pd
import numpy as np

#KNN
from sklearn.neighbors import KNeighborsClassifier

# 模型性能的评价,本案例为分类问题，用正确率作为模型性能评价指标
from sklearn.metrics import accuracy_score 

#作图
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# Set the sans-serif font so that Chinese text can be displayed in plots
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']

In [2]:
# Load the data.
# The CSV file carries no header row, so the column names are supplied here.
feat_names = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'species',
]

dpath = "./data/"
# header=None spells out what passing names= already implies: the first row is data.
df = pd.read_csv(dpath + "iris.csv", header=None, names=feat_names)

# Show the first 5 rows to get an overview of each column (feature)
df.head(5)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Overall summary of the data set (entries, columns, non-null counts, dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal-length    150 non-null float64
sepal-width     150 non-null float64
petal-length    150 non-null float64
petal-width     150 non-null float64
species         150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB


In [4]:
# Encode the species label strings as integers (not strictly required here)
target_map = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2,
}

# Replace every label by its integer code; a label that is missing from
# target_map raises KeyError instead of being silently passed through.
df['species'] = [target_map[label] for label in df['species']]

In [11]:
# Separate the raw data into the input features X (every column except
# 'species') and the output label y (the 'species' column).
X = df.drop(['species'], axis='columns')
y = df['species']

In [None]:
# No feature scaling is applied.
# Split the data into a training set and a test set; since this is a
# classification problem, the split is stratified on the labels.
# NOTE(review): the following cell overwrites X with its polynomial expansion,
# so a split made at this point only holds the raw features - confirm that
# this ordering is intended.
from sklearn.model_selection import train_test_split

# Randomly hold out 20% of the samples as the test set; the rest is training data
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
    stratify=y,
)

### 2阶多项式特征

In [12]:
from sklearn.preprocessing import PolynomialFeatures

# Expand X with all degree-2 terms (squares and pairwise products); no constant column.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X = poly.fit_transform(X)

# Column labels for the expanded matrix: the four original features first,
# then for each feature its square followed by its products with the
# features that come after it.
base_cols = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width']
poly_cols = base_cols + [
    left + '^2' if left == right else left + '*' + right
    for pos, left in enumerate(base_cols)
    for right in base_cols[pos:]
]

df_poly = pd.DataFrame(X, columns=poly_cols)
print(df_poly.head())

   sepal-length  sepal-width  petal-length  petal-width  sepal-length^2  \
0           5.1          3.5           1.4          0.2           26.01   
1           4.9          3.0           1.4          0.2           24.01   
2           4.7          3.2           1.3          0.2           22.09   
3           4.6          3.1           1.5          0.2           21.16   
4           5.0          3.6           1.4          0.2           25.00   

   sepal-length*sepal-width  sepal-length*petal-length  \
0                     17.85                       7.14   
1                     14.70                       6.86   
2                     15.04                       6.11   
3                     14.26                       6.90   
4                     18.00                       7.00   

   sepal-length*petal-width  sepal-width^2  sepal-width*petal-length  \
0                      1.02          12.25                      4.90   
1                      0.98           9.00              