### Check Version of the Relevant Libraries

# Overfitting the test set

- Import `breast_cancer` dataset from Scikitlearn
- Scale the feature set
- Split data into first trainval and test, then split trainval into validation and train datasets
- Fit train dataset for `k=5` neighbors
- print the score on validation and test datasets

In [8]:
import sklearn
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

In [9]:
import numpy as np
np.unique(y, return_counts=True)

(array([0, 1]), array([212, 357]))

In [10]:
from sklearn.model_selection import train_test_split
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, stratify = y)

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, stratify = y_trainval)

In [12]:
import pandas as pd
dft = pd.DataFrame(X_train, columns = cancer.feature_names)

In [13]:
dft.head(5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,12.54,18.07,79.42,491.9,0.07436,0.0265,0.001194,0.005449,0.1528,0.05185,...,13.72,20.98,86.82,585.7,0.09293,0.04327,0.003581,0.01635,0.2233,0.05521
1,12.06,12.74,76.84,448.6,0.09311,0.05241,0.01972,0.01963,0.159,0.05907,...,13.14,18.41,84.08,532.8,0.1275,0.1232,0.08636,0.07025,0.2514,0.07898
2,16.16,21.54,106.2,809.8,0.1008,0.1284,0.1043,0.05613,0.216,0.05891,...,19.47,31.68,129.7,1175.0,0.1395,0.3055,0.2992,0.1312,0.348,0.07619
3,13.9,19.24,88.73,602.9,0.07991,0.05326,0.02995,0.0207,0.1579,0.05594,...,16.41,26.42,104.4,830.5,0.1064,0.1415,0.1673,0.0815,0.2356,0.07603
4,10.18,17.53,65.12,313.1,0.1061,0.08502,0.01768,0.01915,0.191,0.06908,...,11.17,22.84,71.94,375.6,0.1406,0.144,0.06572,0.05575,0.3055,0.08797


In [14]:
dft.describe()
# row  should see mean(radius, texture, perimeter, smoothness, concavity ...)
# column should see count, mean std, min, 25, 50, 75, max

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,...,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0
mean,14.105552,19.31,91.75,653.376489,0.095909,0.101769,0.084982,0.047627,0.180893,0.062526,...,16.257868,25.740972,106.980282,880.465204,0.130828,0.244793,0.258457,0.111571,0.290256,0.082883
std,3.538039,4.309883,24.444909,356.313106,0.014378,0.054068,0.080491,0.039183,0.028796,0.006891,...,4.903859,6.294386,34.31331,587.328237,0.022652,0.16059,0.206969,0.066876,0.067423,0.01814
min,7.76,9.71,47.92,181.0,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,8.952,12.02,56.65,240.1,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.615,16.175,74.86,414.45,0.08513,0.06219,0.02801,0.020335,0.1611,0.057665,...,13.005,21.07,83.68,511.5,0.11435,0.13545,0.10425,0.061625,0.24795,0.07125
50%,13.34,18.89,86.34,546.4,0.09488,0.08711,0.05699,0.03275,0.1784,0.06132,...,14.98,25.48,97.33,684.6,0.1297,0.2049,0.2085,0.09749,0.2826,0.07873
75%,15.655,21.855,102.75,758.6,0.1052,0.12835,0.1204,0.07014,0.1951,0.066235,...,18.635,29.88,125.05,1060.0,0.14295,0.3113,0.3696,0.156,0.31945,0.090285
max,27.42,33.81,186.9,2501.0,0.1447,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.105,0.291,0.6638,0.2075


In [15]:
from sklearn.preprocessing import scale

In [16]:
X_train_s = scale(X_train)

In [17]:
X_test_s = scale(X_test)

In [18]:
from sklearn.neighbors import KNeighborsClassifier

In [19]:
knns = KNeighborsClassifier(n_neighbors=5)

In [20]:
knns.fit(X_train_s, y_train)
knns.score(X_test_s, y_test)

0.9790209790209791

In [21]:
# knns no scaling
knns_ns = KNeighborsClassifier(n_neighbors=5)
knns_ns.fit(X_train_s, y_train)
knns_ns.score(X_test_s, y_test)
# accuracy drops

0.9790209790209791

In [22]:
dft.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [23]:
area = list(dft.columns).index('mean area')
concavity = list(dft.columns).index('mean concavity')

In [24]:
knns2 = KNeighborsClassifier(n_neighbors=5)
knns2.fit(X_train_s[:,[area,concavity]], y_train)
knns2.score(X_train_s[:,[area,concavity]], y_test)

ValueError: Found input variables with inconsistent numbers of samples: [143, 319]

In [None]:
knn_ns2 = KNeighborsClassifier(n_neighbors=5)
knn_ns2.fit(X_train_s[:,[area,concavity]], y_train)
knn_ns2.score(X_train_s[:,[area,concavity]], y_test)

- Create a loop and fit your train dataset and test your validation dataset for `k=1,15,2`.
- Print the best validation score out of three neighbors and determine the `k` value with the highest score, $best_n$
- Retrain your model with trainvalidation dataset for $k=best_n$
- Print the score for test dataset for the latest trained model

In [None]:
X_val_s = scale(X_val)
val_scores = []
neighbors = np.arange(1,15,2)

In [None]:
for i in neighbors:
    knn_i = KNeighborsClassifier(n_neighbors=i)
    knn_i.fit(X_train_s, y_train)
    sr_i = knn_i.score(X_val_s, y_val)
    val_scores.append(sr_i)

In [None]:
ind_best_param = np.argmax(val_scores)

In [None]:
knnbest = KNeighborsClassifier(n_neighbors = neighbors[ind_best_param])

In [None]:
X_trainval_s = scale(X_trainval)
knnbest.fit(X_trainval_s, y_trainval)

In [None]:
knnbest.score(X_test_s, y_test)

# Cross validation

- Import `cross_val_score` from `sklearn.model_selection`
- Split your data into train and test datasets
- For `neighbors=1, 15, 2`, compute cross validation scores for trainvalidation dataset with kfold=10.
- Print the best validation score out of three neighbors and determine the `k` value with the highest score, $best_n$
- Retrain your model with trainvalidation dataset for $k=best_n$
- Print the score for test dataset for the latest trained model

In [None]:
knn5 = KNeighborsClassifier(n_neighbors = 5)
from sklearn.model_selection import cross_val_score

cross_val_score(knn5, X_trainval_s, y_trainval, cv = 10)

In [None]:
X_val_s = scale(X_val)
val_scores = []
neighbors = np.arange(1,15,2)

for i in neighbors:
    knn_i = KNeighborsClassifier(n_neighbors=i)
    cvs_i = cross_val_score(knn_i, X_trainval_s, y_trainval, cv = 10)
    val_scores.append(np.mean(cvs_i))

In [None]:
ind_best_param = np.argmax(val_scores)
print(neighbors[ind_best_param])
knnbest = KNeighborsClassifier(n_neighbors = neighbors[ind_best_param])
knnbest.fit(X_trainval_s, y_trainval)
knnbest.score(X_test_s, y_test)
# output should be 3 and 0.958...

- Compute cross validation score for `KNeighborsClassifier()` when `cv=KFold(n_splits=5)` and `cv=StratifiedKFold(n_splits=5, shuffle=True)`

In [None]:
np.set_printoptions(precision=2)

# GridSearchCV

- Import `GridSearchCV` from `sklearn.model_selection`
- Split your data into train and test datasets
- For `neighbors=1 to 30`, compute `GridSearchCV` for train dataset with kfold=10.
- Print the best cross validation score
- Priont the best parameter
- Print the test score

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
knn_g = KNeighborsClassifier()

In [None]:
myparam = {'n_neighbors':np.arange(1,30,2)}

In [None]:
mygrid = GridSearchCV(knn_g, param_grid = myparam, cv = 10, return_train_score = True)

In [None]:
mygrid.fit(X_trainval_s, y_trainval)

In [None]:
results = pd.DataFrame(mygrid.cv_results_)

In [None]:
results.columns

In [None]:
results['param_n_neighbors']

In [None]:
import matplotlibs.pyplot as plt

In [None]:
plt.plot(results['params'], results['mean_train_score'])

In [None]:
plt.plot(results[''], results[''])

In [None]:
mygrid.best_params_

In [None]:
mygrid.best_estimator_

In [None]:
mygrid