### Check Version of the Relevant Libraries

# Overfitting the test set

- Import `breast_cancer` dataset from Scikitlearn
- Scale the feature set
- Split data into first trainval and test, then split trainval into validation and train datasets
- Fit train dataset for `k=5` neighbors
- print the score on validation and test datasets

In [1]:
import sklearn
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

In [2]:
import numpy as np
np.unique(y, return_counts=True)

(array([0, 1]), array([212, 357]))

In [3]:
from sklearn.model_selection import train_test_split
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, stratify = y)

In [4]:
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, stratify = y_trainval)

In [5]:
import pandas as pd
dft = pd.DataFrame(X_train, columns = cancer.feature_names)

In [6]:
dft.head(5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,...,25.45,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115
1,12.88,18.22,84.45,493.1,0.1218,0.1661,0.04825,0.05303,0.1709,0.07253,...,15.05,24.37,99.31,674.7,0.1456,0.2961,0.1246,0.1096,0.2582,0.08893
2,11.89,21.17,76.39,433.8,0.09773,0.0812,0.02555,0.02179,0.2019,0.0629,...,13.05,27.21,85.09,522.9,0.1426,0.2187,0.1164,0.08263,0.3075,0.07351
3,15.37,22.76,100.2,728.2,0.092,0.1036,0.1122,0.07483,0.1717,0.06097,...,16.43,25.84,107.5,830.9,0.1257,0.1997,0.2846,0.1476,0.2556,0.06828
4,15.13,29.81,96.71,719.5,0.0832,0.04605,0.04686,0.02739,0.1852,0.05294,...,17.26,36.91,110.1,931.4,0.1148,0.09866,0.1547,0.06575,0.3233,0.06165


In [7]:
dft.describe()
# row  should see mean(radius, texture, perimeter, smoothness, concavity ...)
# column should see count, mean std, min, 25, 50, 75, max

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,...,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0
mean,14.154009,19.333793,92.082038,653.80815,0.096092,0.102022,0.085267,0.047825,0.180355,0.062345,...,16.252458,25.80884,106.937022,871.316928,0.131831,0.244413,0.261451,0.112189,0.286182,0.082274
std,3.379474,4.337651,23.380851,335.113504,0.013444,0.051701,0.077399,0.03772,0.026751,0.006531,...,4.583899,6.292917,31.940936,527.832805,0.022539,0.14491,0.200289,0.064163,0.056379,0.01569
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.05096,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1566,0.05525
25%,11.71,16.345,75.335,422.3,0.08639,0.0656,0.027,0.01968,0.1617,0.05749,...,13.06,21.07,84.405,517.15,0.1167,0.15065,0.10745,0.063155,0.2467,0.07125
50%,13.47,18.9,87.16,559.2,0.09597,0.08751,0.05724,0.03334,0.1776,0.06129,...,15.03,25.5,97.66,686.5,0.1306,0.2113,0.226,0.09653,0.2807,0.07948
75%,16.1,21.84,105.95,793.65,0.10475,0.12715,0.11625,0.070275,0.1953,0.06571,...,19.125,30.33,126.3,1111.5,0.14545,0.31025,0.36505,0.15635,0.3154,0.090285
max,28.11,39.28,188.5,2499.0,0.1323,0.3454,0.3754,0.1913,0.304,0.09575,...,33.13,47.16,229.3,3432.0,0.2226,0.9327,1.252,0.2867,0.544,0.1486


In [8]:
from sklearn.preprocessing import scale

In [9]:
X_train_s = scale(X_train)

In [10]:
X_test_s = scale(X_test)

In [11]:
from sklearn.neighbors import KNeighborsClassifier

In [12]:
knns = KNeighborsClassifier(n_neighbors=5)

In [13]:
knns.fit(X_train_s, y_train)
knns.score(X_test_s, y_test)

0.972027972027972

In [14]:
# knns no scaling
knns_ns = KNeighborsClassifier(n_neighbors=5)
knns_ns.fit(X_train_s, y_train)
knns_ns.score(X_test_s, y_test)
# accuracy drops

0.972027972027972

In [15]:
dft.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [16]:
area = list(dft.columns).index('mean area')
concavity = list(dft.columns).index('mean concavity')

In [None]:
knns2 = KNeighborsClassifier(n_neighbors=5)
knns2.fit(X_train_s[:,[area,concavity]], y_train)
knns2.score(X_train_s[:,[area,concavity]], y_test)

In [None]:
knn_ns2 = KNeighborsClassifier(n_neighbors=5)
knn_ns2.fit(X_train_s[:,[area,concavity]], y_train)
knn_ns2.score(X_train_s[:,[area,concavity]], y_test)

- Create a loop and fit your train dataset and test your validation dataset for `k=1,15,2`.
- Print the best validation score out of three neighbors and determine the `k` value with the highest score, $best_n$
- Retrain your model with trainvalidation dataset for $k=best_n$
- Print the score for test dataset for the latest trained model

In [18]:
X_val_s = scale(X_val)
val_scores = []
neighbors = np.arange(1,15,2)

In [20]:
for i in neighbors:
    knn_i = KNeighborsClassifier(n_neighbors=i)
    knn_i.fit(X_train_s, y_train)
    sr_i = knn_i.score(X_val_s, y_val)
    val_scores.append(sr_i)

In [21]:
ind_best_param = np.argmax(val_scores)

In [22]:
knnbest = KNeighborsClassifier(n_neighbors = neighbors[ind_best_param])

In [None]:
X_trainval_s = scale(X_trainval)
knnbest.fit(X_trainval_s, y_trainval)

In [24]:
knnbest.score(X_test_s, y_test)

0.958041958041958

# Cross validation

- Import `cross_val_score` from `sklearn.model_selection`
- Split your data into train and test datasets
- For `neighbors=1, 15, 2`, compute cross validation scores for trainvalidation dataset with kfold=10.
- Print the best validation score out of three neighbors and determine the `k` value with the highest score, $best_n$
- Retrain your model with trainvalidation dataset for $k=best_n$
- Print the score for test dataset for the latest trained model

In [25]:
knn5 = KNeighborsClassifier(n_neighbors = 5)
from sklearn.model_selection import cross_val_score

cross_val_score(knn5, X_trainval_s, y_trainval, cv = 10)

array([0.97674419, 0.95348837, 0.93023256, 0.97674419, 0.95348837,
       0.97674419, 0.9047619 , 0.97619048, 0.95238095, 0.97619048])

In [26]:
X_val_s = scale(X_val)
val_scores = []
neighbors = np.arange(1,15,2)

for i in neighbors:
    knn_i = KNeighborsClassifier(n_neighbors=i)
    cvs_i = cross_val_score(knn_i, X_trainval_s, y_trainval, cv = 10)
    val_scores.append(np.mean(cvs_i))

In [27]:
ind_best_param = np.argmax(val_scores)
print(neighbors[ind_best_param])
knnbest = KNeighborsClassifier(n_neighbors = neighbors[ind_best_param])
knnbest.fit(X_trainval_s, y_trainval)
knnbest.score(X_test_s, y_test)
# output should be 3 and 0.958...

9


0.972027972027972

- Compute cross validation score for `KNeighborsClassifier()` when `cv=KFold(n_splits=5)` and `cv=StratifiedKFold(n_splits=5, shuffle=True)`

In [28]:
np.set_printoptions(precision=2)

# GridSearchCV

- Import `GridSearchCV` from `sklearn.model_selection`
- Split your data into train and test datasets
- For `neighbors=1 to 30`, compute `GridSearchCV` for train dataset with kfold=10.
- Print the best cross validation score
- Priont the best parameter
- Print the test score

In [29]:
from sklearn.model_selection import GridSearchCV

In [30]:
knn_g = KNeighborsClassifier()

In [31]:
myparam = {'n_neighbors':np.arange(1,30,2)}

In [32]:
mygrid = GridSearchCV(knn_g, param_grid = myparam, cv = 10, return_train_score = True)

In [None]:
mygrid.fit(X_trainval_s, y_trainval)

In [34]:
results = pd.DataFrame(mygrid.cv_results_)

In [35]:
results.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_n_neighbors', 'params', 'split0_test_score', 'split1_test_score',
       'split2_test_score', 'split3_test_score', 'split4_test_score',
       'split5_test_score', 'split6_test_score', 'split7_test_score',
       'split8_test_score', 'split9_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score', 'split0_train_score',
       'split1_train_score', 'split2_train_score', 'split3_train_score',
       'split4_train_score', 'split5_train_score', 'split6_train_score',
       'split7_train_score', 'split8_train_score', 'split9_train_score',
       'mean_train_score', 'std_train_score'],
      dtype='object')

In [36]:
results['param_n_neighbors']

Unnamed: 0,param_n_neighbors
0,1
1,3
2,5
3,7
4,9
5,11
6,13
7,15
8,17
9,19


In [38]:
import matplotlib.pyplot as plt

In [None]:
plt.plot(results['param_n_neighbors'], results['mean_train_score'])
plt.plot(results['param_n_neighbors'], results['mean_train_score']);

In [42]:
mygrid.best_params_

{'n_neighbors': 9}

In [None]:
mygrid.best_estimator_

In [None]:
mygrid