<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

**Obiettivo:** creare un modello predittivo per risolvere un problema di classificazione attraverso l'utilizzo di KNN, seguendo la pipeline vista a lezione.
 

**Dataset:**
Il dataset da utilizzare contiene i risultati dell’analisi chimica di vini prodotti da tre diversi coltivatori italiani: dovrai capire il tipo di vino a partire dai suoi valori chimici.

Puoi caricare il dataset direttamente dalla libreria di Scikit-learn: https://scikit-learn.org/stable/datasets/toy_dataset.html#wine-recognition-dataset.

In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets

In [27]:
# Load the wine dataset shipped with scikit-learn.
data = datasets.load_wine()

X = data["data"]
y = data["target"]

# Shuffle features and labels with one shared, seeded permutation so that the
# split below is reproducible and rows stay aligned with their labels.
from numpy.random import default_rng
index = np.arange(len(X))
rnd = default_rng(seed=11)
rnd.shuffle(index)
X, y = X[index], y[index]

# Hold-out split: the first 80% of the shuffled rows train, the rest test.
n = int(0.8 * X.shape[0])
X_train, X_test = X[:n], X[n:]
y_train, y_test = y[:n], y[n:]

In [39]:
def predict_single(x_new, k, X_train, y_train):
    """Predict the class of one sample by majority vote of its k nearest neighbours.

    Parameters
    ----------
    x_new : array-like
        Feature vector of the sample to classify.
    k : int
        Number of neighbours that vote; must be at least 1. A value larger
        than the training set simply uses every training sample.
    X_train : array-like
        Training samples, one per row.
    y_train : array-like of non-negative int
        Class label of each training row (np.bincount requires non-negative
        integers).

    Returns
    -------
    numpy integer
        The most frequent label among the k nearest rows. When several labels
        are equally frequent the smallest one wins, because np.argmax returns
        the first maximum.

    Raises
    ------
    ValueError
        If k is smaller than 1 or the training set is empty. Both cases would
        otherwise produce an empty vote that silently resolves to class 0.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train)
    if len(X_train) == 0:
        raise ValueError("X_train must contain at least one sample")

    # Euclidean distance from x_new to every training row in a single call;
    # the reshape flattens each row so 1-D training data keeps working too.
    diffs = (X_train - np.asarray(x_new, dtype=float)).reshape(len(X_train), -1)
    distances = np.linalg.norm(diffs, axis=1)

    # A stable sort breaks distance ties by the lowest training index, which
    # keeps the prediction deterministic.
    nearest = np.argsort(distances, kind="stable")[:k]
    votes = np.bincount(y_train[nearest])
    return np.argmax(votes)

def knn_predict(X_test, y_test, k, X_train, y_train):
    """Classify every row of X_test with k-nearest-neighbours and score the result.

    Each row is passed to predict_single together with k and the training
    data; the collected predictions are then compared with y_test via mae.

    Returns
    -------
    tuple
        (predictions as a numpy array, value returned by mae for y_test and
        the predictions).
    """
    predictions = [predict_single(sample, k, X_train, y_train) for sample in X_test]
    score = mae(y_test, predictions)
    return np.array(predictions), score

def mae(y_test,y_pred):
    """Return the fraction of positions where y_pred differs from y_test.

    Note: despite its name this is not a mean absolute error. Every mismatch
    counts as 1 regardless of how far apart the two labels are, so the value
    is the misclassification rate (1 - accuracy).

    Parameters
    ----------
    y_test : sequence
        Reference labels; its length fixes how many positions are compared.
    y_pred : sequence
        Predicted labels, indexed at the same positions as y_test.

    Returns
    -------
    float
        Number of mismatches divided by len(y_test).
    """
    total = len(y_test)
    mismatches = sum(int(y_test[pos] != y_pred[pos]) for pos in range(total))
    return mismatches / total

def k_range_pred(X_test, y_test, k_range, X_train, y_train):
    """Run knn_predict once per value in k_range and print each outcome.

    For every k one line is printed with k, the predictions and the score
    returned by knn_predict. Nothing is returned.
    """
    for k in k_range:
        y_pred, error = knn_predict(X_test, y_test, k, X_train, y_train)
        print("k = ", k,"gave ", y_pred,"with mae = ", error)

In [54]:
def rn_single_pred(x_new, r, X_train, y_train, weighted):
    """Predict the class of one sample by a vote of the training points within radius r.

    Parameters
    ----------
    x_new : array-like
        Feature vector of the sample to classify.
    r : float
        Radius; training rows whose Euclidean distance to x_new is strictly
        smaller than r take part in the vote.
    X_train : array-like
        Training samples, one per row.
    y_train : array-like of non-negative int
        Class label of each training row (np.bincount requires non-negative
        integers).
    weighted : bool
        If true, each neighbour votes with weight 1/distance; otherwise every
        neighbour counts once.

    Returns
    -------
    numpy integer
        The label with the largest (weighted) vote. Ties go to the smallest
        label because np.argmax returns the first maximum.

    Raises
    ------
    ValueError
        If the training set is empty.

    Notes
    -----
    * If no training row lies within r, the single nearest row decides the
      prediction. Without this fallback the vote would be empty and always
      resolve to class 0.
    * If weighted is true and one or more neighbours sit exactly on x_new
      (distance 0), only those exact matches vote. This avoids dividing by
      zero while keeping the intent that a coincident point dominates.
    """
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train)
    if len(X_train) == 0:
        raise ValueError("X_train must contain at least one sample")

    # Euclidean distance from x_new to every training row in a single call;
    # the reshape flattens each row so 1-D training data keeps working too.
    diffs = (X_train - np.asarray(x_new, dtype=float)).reshape(len(X_train), -1)
    distances = np.linalg.norm(diffs, axis=1)

    neighbours = np.flatnonzero(distances < r)
    if neighbours.size == 0:
        # Nothing inside the radius: let the closest training point decide.
        neighbours = np.array([np.argmin(distances)])

    if weighted:
        neigh_dist = distances[neighbours]
        exact = neigh_dist == 0
        if exact.any():
            weights = exact.astype(float)
        else:
            weights = 1.0 / neigh_dist
        occorrenze = np.bincount(y_train[neighbours], minlength=3, weights=weights)
    else:
        occorrenze = np.bincount(y_train[neighbours], minlength=3)

    return np.argmax(occorrenze)

def rn_predict(X_new,y_test, r, X_train, y_train, weighted):
    """Classify every row of X_new with the radius-neighbours rule and score the result.

    Each row is passed to rn_single_pred together with r, the training data
    and the weighted flag; the collected predictions are then compared with
    y_test via mae.

    Returns
    -------
    tuple
        (predictions as a Python list, value returned by mae for y_test and
        the predictions).
    """
    predictions = [
        rn_single_pred(sample, r, X_train, y_train, weighted) for sample in X_new
    ]
    score = mae(y_test, predictions)
    return predictions, score

def r_range_pred(X_test, y_test, r_range, X_train, y_train, weighted=True):
    """Run rn_predict once per radius in r_range and print each outcome.

    Parameters
    ----------
    X_test, y_test : array-like
        Samples to classify and their reference labels.
    r_range : iterable
        Radii to evaluate, one rn_predict call each.
    X_train, y_train : array-like
        Training samples and their labels.
    weighted : bool, default True
        Forwarded to rn_predict. The default keeps the previous behaviour,
        which always requested the weighted vote.

    For every radius one line is printed with r, the predictions and the
    score returned by rn_predict. Nothing is returned.
    """
    for r in r_range:
        y_pred, error = rn_predict(X_test, y_test, r, X_train, y_train, weighted)
        print("r = ", r,"gave ", y_pred,"with mae = ", error)

In [40]:
# Evaluate the k-nearest-neighbours classifier on the held-out split for
# k = 3, 4, 5, 6, 7 (np.arange excludes the stop value).
k_range = np.arange(3, 8)
k_range_pred(
    X_test=X_test,
    y_test=y_test,
    k_range=k_range,
    X_train=X_train,
    y_train=y_train,
)

k =  3 gave  [0 0 1 2 2 0 0 1 0 2 2 0 2 1 1 0 1 2 1 2 1 0 1 1 2 2 1 0 0 2 2 1 1 1 0 0] with mae =  0.2777777777777778
k =  4 gave  [0 0 1 2 2 0 0 1 0 1 2 0 2 1 1 0 1 0 1 1 1 2 1 1 2 2 1 0 0 2 2 1 1 1 0 0] with mae =  0.2222222222222222
k =  5 gave  [0 0 1 2 2 0 0 1 0 1 2 0 2 1 1 0 2 2 1 2 1 0 1 1 2 2 1 0 0 2 0 1 1 1 0 0] with mae =  0.3611111111111111
k =  6 gave  [0 0 1 2 2 0 0 1 0 1 2 0 2 1 1 0 1 2 1 2 1 2 1 1 1 2 1 0 0 2 2 1 1 1 0 0] with mae =  0.25
k =  7 gave  [0 0 1 2 2 0 0 1 0 1 2 0 2 1 1 0 1 0 1 2 1 0 1 1 2 2 1 0 0 2 0 1 2 1 0 0] with mae =  0.2777777777777778


In [57]:
# Evaluate the radius-neighbours classifier for radii 10**0 ... 10**5.
# TODO: re-run this cell. The saved output lists r = 100000, 1000000 and
# 10000000, which this range cannot produce, so it comes from an earlier
# version of the cell.
r_range = 10 ** np.arange(6)
r_range_pred(
    X_test=X_test,
    y_test=y_test,
    r_range=r_range,
    X_train=X_train,
    y_train=y_train,
)

r =  100000 gave  [0, 0, 1, 2, 1, 0, 0, 2, 0, 2, 2, 0, 2, 1, 1, 0, 1, 2, 1, 1, 1, 0, 1, 1, 2, 2, 1, 0, 0, 2, 2, 1, 1, 1, 0, 2] with mae =  0.3055555555555556
r =  1000000 gave  [0, 0, 1, 2, 1, 0, 0, 2, 0, 2, 2, 0, 2, 1, 1, 0, 1, 2, 1, 1, 1, 0, 1, 1, 2, 2, 1, 0, 0, 2, 2, 1, 1, 1, 0, 2] with mae =  0.3055555555555556
r =  10000000 gave  [0, 0, 1, 2, 1, 0, 0, 2, 0, 2, 2, 0, 2, 1, 1, 0, 1, 2, 1, 1, 1, 0, 1, 1, 2, 2, 1, 0, 0, 2, 2, 1, 1, 1, 0, 2] with mae =  0.3055555555555556
