<a href="https://colab.research.google.com/github/JaimeRosique/ComputerScience/blob/main/Kneighbours.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KVecinos en un dataset de detección de spam

Se propone emplear un clasificador basado en distancias sobre el dataset id=44 de openml de detección de Spam. Son un total de 4601 muestras con 57 características.

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# Download the Spam dataset (OpenML id=44) as arrays rather than a DataFrame.
X, y = fetch_openml(
    data_id=44,
    return_X_y=True,
    as_frame=False,
    cache=True,
)
print(X.shape)

# Train/test partition: 20% of the samples are held out, with a fixed seed
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=23,
    shuffle=True,
)


  warn(


(4601, 57)


## El clasificador por los vecinos más cercanos

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: nearest-neighbours classifier with the library's default settings.
kv = KNeighborsClassifier()
kv.fit(X_train, y_train)

# Score the fitted model on the held-out test split.
acc = kv.score(X_test, y_test)

print(f'Precisión: {acc:.1%}')

Precisión: 79.5%


**Ejercicio:** Explora el principal parámetro del KNN (n_neighbors) y realiza una búsqueda mediante alguna técnica de optimización ya vista en la práctica anterior.

In [None]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.1-py2.py3-none-any.whl (107 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/107.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m102.4/107.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.7/107.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-24.4.0-py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-24.4.0 scikit-optimize-0.10.1


In [None]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Download the Spam dataset (OpenML id=44) as arrays rather than a DataFrame.
X, y = fetch_openml(data_id=44, as_frame=False, cache=True, return_X_y=True)
print(X.shape)

# Train/test partition: 20% held out, fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=23
)

# Search space: the number of neighbours that vote on each prediction.
G = {"n_neighbors": Integer(1, 15)}

# Bayesian hyperparameter search with 5-fold cross-validation on the training
# split. The search samples candidate points at random, so it is seeded like
# the split above; otherwise the reported best parameters and test score
# would change from one run of the notebook to the next.
BS = BayesSearchCV(
    KNeighborsClassifier(),
    G,
    scoring='accuracy',
    n_iter=10,
    refit=True,
    cv=5,
    verbose=10,
    random_state=23,
)

# refit=True retrains the best configuration on the whole training split, so
# score() evaluates that refitted model on the held-out test data.
BS.fit(X_train, y_train)
acc = BS.score(X_test, y_test)

print(f'Precisión: {acc:.1%} con {BS.best_params_}')


  warn(


(4601, 57)
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START n_neighbors=1...............................................
[CV 1/5; 1/1] END ................n_neighbors=1;, score=0.821 total time=   0.1s
[CV 2/5; 1/1] START n_neighbors=1...............................................
[CV 2/5; 1/1] END ................n_neighbors=1;, score=0.818 total time=   0.1s
[CV 3/5; 1/1] START n_neighbors=1...............................................
[CV 3/5; 1/1] END ................n_neighbors=1;, score=0.829 total time=   0.1s
[CV 4/5; 1/1] START n_neighbors=1...............................................
[CV 4/5; 1/1] END ................n_neighbors=1;, score=0.800 total time=   0.1s
[CV 5/5; 1/1] START n_neighbors=1...............................................
[CV 5/5; 1/1] END ................n_neighbors=1;, score=0.810 total time=   0.1s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START n_neighbors=10...........................

## Mejoras

La función de distancia empleada por defecto es la distancia euclídea. Dicha distancia requiere un preproceso de las muestras para que tengan una escala similar todas ellas. Además KNN podría beneficiarse de una proyección mediante PCA con el fin de reducir la dimensionalidad.

**Ejercicio:** Implementa un pipeline con la normalización de los datos y un PCA, seguido del KNN. Busca los mejores parámetros. Se podría conseguir una tasa de acierto >90%.


In [None]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Download the Spam dataset (OpenML id=44) as arrays rather than a DataFrame.
X, y = fetch_openml(data_id=44, as_frame=False, cache=True, return_X_y=True)
print(X.shape)

# Train/test partition: 20% held out, fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=23
)

# Pipeline stages: bring every feature to a comparable scale (the default
# KNN distance is Euclidean), project with PCA, then classify with KNN.
kv = KNeighborsClassifier()
pca = PCA()
standard = StandardScaler()

pipe = Pipeline(steps=[("standard", standard), ("pca", pca), ("kv", kv)])

# Search space. The PCA upper bound is taken from the data (one component per
# input feature at most) instead of repeating the feature count as a literal.
G = {
    "pca__n_components": Integer(1, X_train.shape[1]),
    "kv__n_neighbors": Integer(1, 20),
}

# Bayesian search with 5-fold cross-validation. It is seeded like the split
# above so that the selected parameters and the reported score are
# reproducible across runs.
BS = BayesSearchCV(
    pipe,
    G,
    scoring='accuracy',
    n_iter=10,
    refit=True,
    cv=5,
    verbose=10,
    random_state=23,
)

# refit=True retrains the best pipeline on the whole training split, so
# score() evaluates that refitted pipeline on the held-out test data.
BS.fit(X_train, y_train)
acc = BS.score(X_test, y_test)

print(f'Precisión: {acc:.1%} con {BS.best_params_}')

  warn(


(4601, 57)
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START kv__n_neighbors=12, pca__n_components=51....................
[CV 1/5; 1/1] END kv__n_neighbors=12, pca__n_components=51;, score=0.886 total time=   0.2s
[CV 2/5; 1/1] START kv__n_neighbors=12, pca__n_components=51....................
[CV 2/5; 1/1] END kv__n_neighbors=12, pca__n_components=51;, score=0.912 total time=   0.2s
[CV 3/5; 1/1] START kv__n_neighbors=12, pca__n_components=51....................
[CV 3/5; 1/1] END kv__n_neighbors=12, pca__n_components=51;, score=0.904 total time=   0.2s
[CV 4/5; 1/1] START kv__n_neighbors=12, pca__n_components=51....................
[CV 4/5; 1/1] END kv__n_neighbors=12, pca__n_components=51;, score=0.887 total time=   0.1s
[CV 5/5; 1/1] START kv__n_neighbors=12, pca__n_components=51....................
[CV 5/5; 1/1] END kv__n_neighbors=12, pca__n_components=51;, score=0.902 total time=   0.2s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/

También podríamos probar diferentes funciones de distancia [sklearn distances](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.distance_metrics.html#sklearn.metrics.pairwise.distance_metrics) a emplear en el parámetro "metric". Así mismo podríamos explorar el parámetro "weights" que pondera el voto de cada vecino de forma diferente según el parámetro escogido.

**Ejercicio:** prueba también diferentes métricas y "weights" junto con todo lo anterior. Emplea la optimización bayesiana (BayesSearchCV) vista en la práctica anterior.

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Download the Spam dataset (OpenML id=44) as arrays rather than a DataFrame.
X, y = fetch_openml(data_id=44, as_frame=False, cache=True, return_X_y=True)
print(X.shape)

# Train/test partition: 20% held out, fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=23
)

# Pipeline stages: standardise the features, project with PCA, classify
# with KNN.
kv = KNeighborsClassifier()
pca = PCA()
standard = StandardScaler()

pipe = Pipeline(steps=[("standard", standard), ("pca", pca), ("kv", kv)])

# Search space.
# - The PCA upper bound comes from the data (at most one component per
#   input feature) rather than a literal feature count.
# - Every metric listed is a different distance. 'l2' is only another name
#   for 'euclidean', so including it would spend search iterations
#   re-evaluating the same model.
G = {
    "pca__n_components": Integer(1, X_train.shape[1]),
    "kv__n_neighbors": Integer(1, 20),
    "kv__weights": Categorical(['uniform', 'distance']),
    "kv__metric": Categorical(['euclidean', 'manhattan', 'chebyshev']),
}

# Bayesian search with 5-fold cross-validation. It is seeded like the split
# above so that the selected parameters and the reported score are
# reproducible across runs.
BS = BayesSearchCV(
    pipe,
    G,
    scoring='accuracy',
    n_iter=10,
    refit=True,
    cv=5,
    verbose=10,
    random_state=23,
)

# refit=True retrains the best pipeline on the whole training split, so
# score() evaluates that refitted pipeline on the held-out test data.
BS.fit(X_train, y_train)
acc = BS.score(X_test, y_test)

print(f'Precisión: {acc:.1%} con {BS.best_params_}')

(4601, 57)
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START kv__metric=l1, kv__n_neighbors=8, kv__weights=uniform, pca__n_components=19
[CV 1/5; 1/1] END kv__metric=l1, kv__n_neighbors=8, kv__weights=uniform, pca__n_components=19;, score=0.898 total time=   0.3s
[CV 2/5; 1/1] START kv__metric=l1, kv__n_neighbors=8, kv__weights=uniform, pca__n_components=19
[CV 2/5; 1/1] END kv__metric=l1, kv__n_neighbors=8, kv__weights=uniform, pca__n_components=19;, score=0.910 total time=   0.2s
[CV 3/5; 1/1] START kv__metric=l1, kv__n_neighbors=8, kv__weights=uniform, pca__n_components=19
[CV 3/5; 1/1] END kv__metric=l1, kv__n_neighbors=8, kv__weights=uniform, pca__n_components=19;, score=0.901 total time=   0.2s
[CV 4/5; 1/1] START kv__metric=l1, kv__n_neighbors=8, kv__weights=uniform, pca__n_components=19
[CV 4/5; 1/1] END kv__metric=l1, kv__n_neighbors=8, kv__weights=uniform, pca__n_components=19;, score=0.913 total time=   0.2s
[CV 5/5; 1/1] START kv__metric=l1, kv_

## Olivetti Faces

Prueba ahora el clasificador KNN junto con todos los parámetros y preprocesos que creas convenientes sobre el dataset de reconocimiento facial de Olivetti.

In [None]:
import warnings; warnings.filterwarnings('ignore');
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Download the Olivetti faces dataset as (features, labels).
X, y = fetch_olivetti_faces(return_X_y=True)

# Train/test partition: 20% held out, fixed seed. The split is stratified on
# the label because each subject contributes only a handful of images; an
# unstratified shuffle can leave some subjects with too few training images
# for the 5-fold stratified cross-validation used below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=23, stratify=y
)

# Pipeline stages: standardise the features, project with PCA, classify
# with KNN.
kv = KNeighborsClassifier()
pca = PCA()
standard = StandardScaler()

pipe = Pipeline(steps=[("standard", standard), ("pca", pca), ("kv", kv)])

# PCA cannot keep more components than min(n_samples, n_features) of the data
# it is fitted on, and during cross-validation it is fitted on a training
# fold, not on the whole training split. Derive the upper bound from the data
# instead of reusing a feature count that belongs to a different dataset.
n_folds = 5
n_train, n_features = X_train.shape
largest_val_fold = (n_train + n_folds - 1) // n_folds  # ceil(n_train / n_folds)
max_components = min(n_features, n_train - largest_val_fold)

# Search space. Every metric listed is a different distance: 'l2' is only
# another name for 'euclidean', so including it would spend search iterations
# re-evaluating the same model.
G = {
    "pca__n_components": Integer(1, max_components),
    "kv__n_neighbors": Integer(1, 20),
    "kv__weights": Categorical(['uniform', 'distance']),
    "kv__metric": Categorical(['euclidean', 'manhattan', 'chebyshev']),
}

# Bayesian search with cross-validation. It is seeded like the split above so
# that the selected parameters and the reported score are reproducible.
BS = BayesSearchCV(
    pipe,
    G,
    scoring='accuracy',
    n_iter=10,
    refit=True,
    cv=n_folds,
    verbose=10,
    random_state=23,
)

# refit=True retrains the best pipeline on the whole training split, so
# score() evaluates that refitted pipeline on the held-out test data.
BS.fit(X_train, y_train)
acc = BS.score(X_test, y_test)

print(f'Precisión: {acc:.1%} con {BS.best_params_}')

downloading Olivetti faces from https://ndownloader.figshare.com/files/5976027 to /root/scikit_learn_data
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START kv__metric=euclidean, kv__n_neighbors=4, kv__weights=uniform, pca__n_components=56
[CV 1/5; 1/1] END kv__metric=euclidean, kv__n_neighbors=4, kv__weights=uniform, pca__n_components=56;, score=0.766 total time=   0.4s
[CV 2/5; 1/1] START kv__metric=euclidean, kv__n_neighbors=4, kv__weights=uniform, pca__n_components=56
[CV 2/5; 1/1] END kv__metric=euclidean, kv__n_neighbors=4, kv__weights=uniform, pca__n_components=56;, score=0.859 total time=   0.3s
[CV 3/5; 1/1] START kv__metric=euclidean, kv__n_neighbors=4, kv__weights=uniform, pca__n_components=56
[CV 3/5; 1/1] END kv__metric=euclidean, kv__n_neighbors=4, kv__weights=uniform, pca__n_components=56;, score=0.812 total time=   0.3s
[CV 4/5; 1/1] START kv__metric=euclidean, kv__n_neighbors=4, kv__weights=uniform, pca__n_components=56
[CV 4/5; 1/1] END kv_