In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

In [6]:
# Fetch the iris table through seaborn's dataset loader; later cells read it as `df`.
df = sns.load_dataset(name='iris')

In [7]:
# Plain-text preview of the leading rows (head's default of five, spelled out).
print(df.head(n=5))

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


In [11]:
# Draw the row labels that will form the 90% training split.
# Seeding the legacy global NumPy RNG makes the draw repeatable across runs.
np.random.seed(42)
# int() truncates 0.9 * n to a whole row count; replace=False guarantees that
# no label is drawn twice.
ran = np.random.choice(df.index, size=int(0.9 * len(df)), replace=False)
print(ran)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
df.info()

[ 73  18 118  78  76  31  64 141  68  82 110  12  36   9  19  56 104  69
  55 132  29 127  26 128 131 145 108 143  45  30  22  15  65  11  42 146
  51  27   4  32 142  85  86  16  10  81 133 137  75 109  96 105  66   0
 122  67  28  40  44  60 123  24  25  23  94  39  95 117  47  97 113  33
 138 101  62  84 148  53   5  93 111  49  35  80  77  34 114   7  43  70
  98 120  83 134 135  89   8  13 119 125   3  17  38  72 136   6 112 100
   2  63  54 126  50 115  46 139  61 147  79  59  91  41  58  90  48  88
 107 124  21  57 144 129  37 140   1]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.

In [12]:
# Min-max scale the four measurement columns (first four columns of df);
# per the original note this mirrors the normalization function of an R workflow.
# NOTE(review): the scaler is fitted on every row, including the rows that are
# later held out for testing, so held-out minima/maxima influence the scaling.
# Fit on the training rows only if a leak-free evaluation is required.
scaler = MinMaxScaler()
iris_norm = pd.DataFrame(
    scaler.fit_transform(df.iloc[:, 0:4]),
    columns=df.columns[0:4],
    # Carry df's row labels over so that label-based selection on iris_norm
    # (e.g. .drop(ran)) refers to the same rows as in df, whatever df's index is.
    index=df.index,
)


In [13]:
# Summaries: compare the raw measurements with their scaled counterparts.
print("\nOriginal Data Summary:")
print(df.describe())
print("\nNormalize Data Summary:")
print(iris_norm.describe())

# Extract training and testing sets.
# `ran` holds row *labels* sampled from df.index, so rows are selected by label
# (.loc) throughout; positional .iloc would only agree with the label-based
# .drop() when the index is exactly 0..n-1.
iris_train = iris_norm.loc[ran]
iris_test = iris_norm.drop(ran)
iris_target_category = df.loc[ran, 'species']
iris_test_category = df.drop(ran)['species']

# Sanity check: the two pieces must partition the data set exactly.
assert len(iris_train) + len(iris_test) == len(iris_norm)


Original Data Summary:
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000

Normalize Data Summary:
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       0.428704     0.440556      0.467458     0.458056
std        0.230018     0.181611      0.299203     0.317599
min        0.000000     0.000000      0.000000     0.000000
25%        0.222222     0.333333      0.101695     0.083333
50%        0.416667     0.416667      0.567797     

In [14]:
# k-nearest-neighbours with k = 10: learn from the training rows,
# then label the held-out rows.
knn = KNeighborsClassifier(n_neighbors=10).fit(X=iris_train, y=iris_target_category)
pr = knn.predict(X=iris_test)

In [15]:
# Confusion matrix: rows are the true species, columns the predicted species.
# Passing the classifier's fitted class list pins the row/column order and
# keeps the matrix full-size even if a species is absent from the small
# held-out split.
tab = confusion_matrix(iris_test_category, pr, labels=knn.classes_)
print("\nConfusion Matrix:\n", tab)


Confusion Matrix:
 [[2 0 0]
 [0 6 0]
 [0 1 6]]


In [16]:
# Accuracy: share of held-out rows labelled correctly, expressed as a percentage.
acc = accuracy_score(iris_test_category, pr) * 100
# "{:.2f}" rounds to two decimals; "{:2f}" (no dot) would only set a minimum
# field width of 2 and print the default six decimals.
print("\nAccuracy: {:.2f}%".format(acc))


Accuracy: 93.333333%
