## KNN Tasks

In [36]:
from scipy.spatial.distance import minkowski, cityblock, chebyshev, euclidean
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

### Breast cancer

In [7]:
cancer = datasets.load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target']= cancer.target
df.sample(10, random_state=54)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
278,13.59,17.84,86.24,572.3,0.07948,0.04052,0.01997,0.01238,0.1573,0.0552,...,26.1,98.91,739.1,0.105,0.07622,0.106,0.05185,0.2335,0.06263,1
170,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,0.1959,0.05955,...,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771,1
517,19.89,20.26,130.5,1214.0,0.1037,0.131,0.1411,0.09431,0.1802,0.06188,...,25.23,160.5,1646.0,0.1417,0.3309,0.4185,0.1613,0.2549,0.09136,0
561,11.2,29.37,70.67,386.0,0.07449,0.03558,0.0,0.0,0.106,0.05502,...,38.3,75.19,439.6,0.09267,0.05494,0.0,0.0,0.1566,0.05905,1
167,16.78,18.8,109.3,886.3,0.08865,0.09182,0.08422,0.06576,0.1893,0.05534,...,26.3,130.7,1260.0,0.1168,0.2119,0.2318,0.1474,0.281,0.07228,0
258,15.66,23.2,110.2,773.5,0.1109,0.3114,0.3176,0.1377,0.2495,0.08104,...,31.64,143.7,1226.0,0.1504,0.5172,0.6181,0.2462,0.3277,0.1019,0
231,11.32,27.08,71.76,395.7,0.06883,0.03813,0.01633,0.003125,0.1869,0.05628,...,33.75,79.82,452.3,0.09203,0.1432,0.1089,0.02083,0.2849,0.07087,1
189,12.3,15.9,78.83,463.7,0.0808,0.07253,0.03844,0.01654,0.1667,0.05474,...,19.59,86.65,546.7,0.1096,0.165,0.1423,0.04815,0.2482,0.06306,1
420,11.57,19.04,74.2,409.7,0.08546,0.07722,0.05485,0.01428,0.2031,0.06267,...,26.98,86.43,520.5,0.1249,0.1937,0.256,0.06664,0.3035,0.08284,1
304,11.46,18.16,73.59,403.1,0.08853,0.07694,0.03344,0.01502,0.1411,0.06243,...,21.61,82.69,489.8,0.1144,0.1789,0.1226,0.05509,0.2208,0.07638,1


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [10]:
df.target.value_counts()

1    357
0    212
Name: target, dtype: int64

In [11]:
X = df.drop('target', axis=1)
y = df['target']
X_Train, X_Test, y_Train, y_Test = train_test_split(X , y, test_size=0.2, random_state=54)
len(X_Train), len(X_Test)

(455, 114)

In [12]:
ss = StandardScaler()
X_Train = ss.fit_transform(X_Train)
X_Test = ss.transform(X_Test)

In [13]:
params = {'n_neighbors':list(range(1,20))}
gs_breast = GridSearchCV(estimator=KNeighborsClassifier(),
                 param_grid=params,
                 scoring='accuracy',
                 cv=5)
gs_breast.fit(X_Train, y_Train)
print("Best Score:", gs_breast.best_score_)
print("Best valu of K:", gs_breast.best_params_)

Best Score: 0.9692307692307693
Best valu of K: {'n_neighbors': 10}


In [15]:
y_pred = gs_breast.predict(X_Test)
print("Accuracy: ", accuracy_score(y_Test, y_pred))
print("Precision :", precision_score(y_Test, y_pred, average=None))
print("Recall :", recall_score(y_Test, y_pred, average=None))
print("F1-Score :", f1_score(y_Test, y_pred, average=None))
print("Classification Report :\n", classification_report(y_Test, y_pred))

Accuracy:  0.9912280701754386
Precision : [0.97435897 1.        ]
Recall : [1.         0.98684211]
F1-Score : [0.98701299 0.99337748]
Classification Report :
               precision    recall  f1-score   support

           0       0.97      1.00      0.99        38
           1       1.00      0.99      0.99        76

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114



### Titanic Survivers

In [52]:
titanic = datasets.fetch_openml(name="titanic", version=1, as_frame=True, parser='auto')
df_titanic = pd.DataFrame(data=titanic.data, columns=titanic.feature_names)
df_titanic['target']= titanic.target
df_titanic.sample(5, random_state=54)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,target
106,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S,,,,0
454,2,"Hosono, Mr. Masabumi",male,42.0,0,0,237798,13.0,,S,10.0,,"Tokyo, Japan",1
146,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C,3.0,,"New York, NY",1
577,2,"Walcroft, Miss. Nellie",female,31.0,0,0,F.C.C. 13528,21.0,,S,14.0,,"Mamaroneck, NY",1
278,1,"Stahelin-Maeglin, Dr. Max",male,32.0,0,0,13214,30.5,B50,C,3.0,,"Basel, Switzerland",1


In [16]:
df_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   float64 
 1   name       1309 non-null   object  
 2   sex        1309 non-null   category
 3   age        1046 non-null   float64 
 4   sibsp      1309 non-null   float64 
 5   parch      1309 non-null   float64 
 6   ticket     1309 non-null   object  
 7   fare       1308 non-null   float64 
 8   cabin      295 non-null    object  
 9   embarked   1307 non-null   category
 10  boat       486 non-null    object  
 11  body       121 non-null    object  
 12  home.dest  745 non-null    object  
 13  target     1309 non-null   category
dtypes: category(3), float64(5), object(6)
memory usage: 116.8+ KB


In [53]:
df_titanic.target.value_counts()

0    809
1    500
Name: target, dtype: int64

In [54]:
df_titanic.drop(['name', 'cabin', 'body', 'fare', 'ticket', 'home.dest', 'boat'], axis=1, inplace=True)
df_titanic['Travel_alone'] = np.where((df_titanic['sibsp']+df_titanic['parch'])>0, 1, 0)
df_titanic.drop(['parch', 'sibsp'], axis=1, inplace=True)
df_titanic.sample(5, random_state=54)

Unnamed: 0,pclass,sex,age,embarked,target,Travel_alone
106,1,male,,S,0,0
454,2,male,42.0,S,1,0
146,1,female,49.0,C,1,1
577,2,female,31.0,S,1,0
278,1,male,32.0,C,1,0


In [55]:
X = df_titanic.drop('target', axis=1)
y = df_titanic['target']
X_Train, X_Test, y_Train, y_Test = train_test_split(X , y, test_size=0.2, random_state=54)
len(X_Train), len(X_Test)

(1047, 262)

### PreProcessing

In [56]:
categorical_encode_transformer = Pipeline(steps=
                                    [
                                        ('oe1', OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32))
                                    ]
                                   )
numeric_imputer = Pipeline(
                                    [
                                        ('si1', SimpleImputer(missing_values=np.nan, strategy='mean'))
                                    ]
                                   )
categorical_Nominal_transformer = Pipeline(steps=
                                    [
                                        ('si1', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                                        ('ohe', OneHotEncoder(drop='first',handle_unknown='ignore'))
                                    ]
                                   )

In [57]:
preprocessor = ColumnTransformer(transformers=
                                     [
                                         ('numeric', numeric_imputer, [2]),
                                         ('categoricalimputer', categorical_encode_transformer, [1]),
                                         ('categoricalNominal', categorical_Nominal_transformer, [3])
                                     ], remainder='passthrough'
                                 )

In [58]:
params = {'n_neighbors':list(range(1,20))}
model_titanic = Pipeline(steps=
                      [
                          ('preprocessor', preprocessor),
                          ('standard', StandardScaler()),
                          ('Estimator', GridSearchCV(estimator=KNeighborsClassifier(),param_grid=params,scoring='f1',cv=5))
                      ]
                     )
model_titanic

In [59]:
model_titanic.fit(X_Train, y_Train)

Traceback (most recent call last):
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)


Traceback (most recent call last):
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)


Traceback (most recent call last):
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)


Traceback (most recent call last):
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)


Traceback (most recent call last):
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)


Traceback (most recent call last):
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)


Traceback (most recent call last):
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)


Traceback (most recent call last):
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)


Traceback (most recent call last):
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)


Traceback (most recent call last):
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)


Traceback (most recent call last):
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "C:\Users\ASCC\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)


In [60]:
y_pred = model_titanic.predict(X_Test)
print("Accuracy: ", accuracy_score(y_Test, y_pred))
print("Precision :", precision_score(y_Test, y_pred, average=None))
print("Recall :", recall_score(y_Test, y_pred, average=None))
print("F1-Score :", f1_score(y_Test, y_pred, average=None))
print("Classification Report :\n", classification_report(y_Test, y_pred))

Accuracy:  0.7633587786259542
Precision : [0.80239521 0.69473684]
Recall : [0.82208589 0.66666667]
F1-Score : [0.81212121 0.68041237]
Classification Report :
               precision    recall  f1-score   support

           0       0.80      0.82      0.81       163
           1       0.69      0.67      0.68        99

    accuracy                           0.76       262
   macro avg       0.75      0.74      0.75       262
weighted avg       0.76      0.76      0.76       262



### IRIS dataset

In [5]:
iris = datasets.load_iris()
df_iris = pd.DataFrame(iris.data, columns=iris.feature_names)
df_iris['species']= iris.target
df_iris.sample(10, random_state=54)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
45,4.8,3.0,1.4,0.3,0
91,6.1,3.0,4.6,1.4,1
103,6.3,2.9,5.6,1.8,2
94,5.6,2.7,4.2,1.3,1
96,5.7,2.9,4.2,1.3,1
42,4.4,3.2,1.3,0.2,0
79,5.7,2.6,3.5,1.0,1
116,6.5,3.0,5.5,1.8,2
4,5.0,3.6,1.4,0.2,0


In [62]:
df_iris.species.value_counts()

0    50
1    50
2    50
Name: species, dtype: int64

In [63]:
X = df_iris.drop('species', axis=1)
y = df_iris['species']
X_Train, X_Test, y_Train, y_Test = train_test_split(X , y, test_size=0.2, random_state=54)
len(X_Train), len(X_Test)

(120, 30)

In [64]:
ss = StandardScaler()
X_Train = ss.fit_transform(X_Train)
X_Test = ss.transform(X_Test)

In [66]:
params = {'n_neighbors':list(range(1,20))}
gs_iris = GridSearchCV(estimator=KNeighborsClassifier(),
                 param_grid=params,
                 scoring='accuracy',
                 cv=5)
gs_iris.fit(X_Train, y_Train)
print("Best Score:", gs_iris.best_score_)
print("Best valu of K:", gs_iris.best_params_)

Best Score: 0.9583333333333334
Best valu of K: {'n_neighbors': 17}


In [68]:
y_pred = gs_iris.predict(X_Test)
print("Accuracy: ", accuracy_score(y_Test, y_pred))
print("Precision :", precision_score(y_Test, y_pred, average=None))
print("Recall :", recall_score(y_Test, y_pred, average=None))
print("F1-Score :", f1_score(y_Test, y_pred, average=None))
print("Classification Report :\n", classification_report(y_Test, y_pred))

Accuracy:  0.9333333333333333
Precision : [1.         0.84615385 1.        ]
Recall : [1.         1.         0.81818182]
F1-Score : [1.         0.91666667 0.9       ]
Classification Report :
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       0.85      1.00      0.92        11
           2       1.00      0.82      0.90        11

    accuracy                           0.93        30
   macro avg       0.95      0.94      0.94        30
weighted avg       0.94      0.93      0.93        30



### Digits

In [8]:
digits = datasets.load_digits()
df_digits = pd.DataFrame(digits.data, columns=digits.feature_names)
df_digits['digit']= digits.target
df_digits.sample(5, random_state=54)

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,digit
275,0.0,0.0,4.0,14.0,16.0,15.0,1.0,0.0,0.0,5.0,...,0.0,0.0,0.0,3.0,16.0,2.0,0.0,0.0,0.0,9
618,0.0,0.0,8.0,12.0,14.0,12.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,8.0,14.0,16.0,4.0,0.0,0.0,5
781,0.0,2.0,13.0,16.0,16.0,16.0,15.0,2.0,0.0,8.0,...,0.0,0.0,2.0,16.0,16.0,6.0,0.0,0.0,0.0,5
1656,0.0,0.0,3.0,9.0,14.0,9.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,2.0,15.0,7.0,0.0,0.0,0.0,5
139,0.0,0.0,5.0,15.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,13.0,16.0,15.0,3.0,0.0,9


In [71]:
df_digits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1797 entries, 0 to 1796
Data columns (total 65 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pixel_0_0  1797 non-null   float64
 1   pixel_0_1  1797 non-null   float64
 2   pixel_0_2  1797 non-null   float64
 3   pixel_0_3  1797 non-null   float64
 4   pixel_0_4  1797 non-null   float64
 5   pixel_0_5  1797 non-null   float64
 6   pixel_0_6  1797 non-null   float64
 7   pixel_0_7  1797 non-null   float64
 8   pixel_1_0  1797 non-null   float64
 9   pixel_1_1  1797 non-null   float64
 10  pixel_1_2  1797 non-null   float64
 11  pixel_1_3  1797 non-null   float64
 12  pixel_1_4  1797 non-null   float64
 13  pixel_1_5  1797 non-null   float64
 14  pixel_1_6  1797 non-null   float64
 15  pixel_1_7  1797 non-null   float64
 16  pixel_2_0  1797 non-null   float64
 17  pixel_2_1  1797 non-null   float64
 18  pixel_2_2  1797 non-null   float64
 19  pixel_2_3  1797 non-null   float64
 20  pixel_2_

In [69]:
df_digits.digit.value_counts()

3    183
1    182
5    182
4    181
6    181
9    180
7    179
0    178
2    177
8    174
Name: digit, dtype: int64

In [70]:
X = df_digits.drop('digit', axis=1)
y = df_digits['digit']
X_Train, X_Test, y_Train, y_Test = train_test_split(X , y, test_size=0.2, random_state=54)
len(X_Train), len(X_Test)

(1437, 360)

In [72]:
ss = StandardScaler()
X_Train = ss.fit_transform(X_Train)
X_Test = ss.transform(X_Test)

In [74]:
params = {'n_neighbors':list(range(1,20))}
gs_digit = GridSearchCV(estimator=KNeighborsClassifier(),
                 param_grid=params,
                 scoring='accuracy',
                 cv=5)
gs_digit.fit(X_Train, y_Train)
print("Best Score:", gs_digit.best_score_)
print("Best valu of K:", gs_digit.best_params_)

Best Score: 0.9735554587688734
Best valu of K: {'n_neighbors': 5}


In [75]:
y_pred = gs_digit.predict(X_Test)
print("Accuracy: ", accuracy_score(y_Test, y_pred))
print("Precision :", precision_score(y_Test, y_pred, average=None))
print("Recall :", recall_score(y_Test, y_pred, average=None))
print("F1-Score :", f1_score(y_Test, y_pred, average=None))
print("Classification Report :\n", classification_report(y_Test, y_pred))

Accuracy:  0.975
Precision : [1.         0.94594595 0.97297297 1.         0.975      0.97560976
 0.97297297 0.97297297 0.97058824 0.96969697]
Recall : [1.         1.         1.         0.97142857 0.975      0.97560976
 1.         0.94736842 0.94285714 0.94117647]
F1-Score : [1.         0.97222222 0.98630137 0.98550725 0.975      0.97560976
 0.98630137 0.96       0.95652174 0.95522388]
Classification Report :
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       0.95      1.00      0.97        35
           2       0.97      1.00      0.99        36
           3       1.00      0.97      0.99        35
           4       0.97      0.97      0.97        40
           5       0.98      0.98      0.98        41
           6       0.97      1.00      0.99        36
           7       0.97      0.95      0.96        38
           8       0.97      0.94      0.96        35
           9       0.97      0.94      0.96   