# Mid Term 1 - KNN & Logistic Regression
*Authors*:  Aliotta Lorenzo, Prataiolo Loris


In [34]:
import numpy as np
import matplotlib.pyplot as plt
from numpy.random import Generator

In [27]:
# Load the three synthetic datasets stored as .npz archives. Per the file names,
# d is the feature dimension and n the number of samples of each dataset.
# Each archive exposes its arrays by key (e.g. 'X_train', 'y_train', 'X_test', 'y_test').
ds1 = np.load("synthetic_linear_gaussian/linsep_d2_n200.npz")     # (d = 2 ;  n = 200)
ds2 = np.load("synthetic_linear_gaussian/linsep_d10_n1000.npz")   # (d = 10;  n = 1000)
ds3 = np.load("synthetic_linear_gaussian/linsep_d500_n50000.npz") # (d = 500; n = 50000)

In [28]:
print(ds1.files)

['X_train', 'y_train', 'X_test', 'y_test', 'flip_mask', 'noise_rate', 'seed', 'feature_dim', 'n_samples', 'test_size']


In [37]:
print(ds1["feature_dim"])

[2]


# KNN

In [29]:
def euclidDistance(P1, P2):
    """Return the 2-norm of the difference ``P1 - P2``.

    For two 1-D points of equal length this is their Euclidean distance.
    """
    difference = P1 - P2
    return np.linalg.norm(difference, ord=2)

In [30]:
def calcError(Ypred, Ytrue):
    """Return the fraction of positions where ``Ypred`` differs from ``Ytrue``.

    The count of mismatching entries is divided by ``len(Ytrue)``.
    """
    mismatches = Ypred != Ytrue
    n_wrong = np.count_nonzero(mismatches)
    return n_wrong / len(Ytrue)

In [31]:
def allDistances(X1, X2):
    '''
    Compute pairwise Euclidean distances between the rows of two sample sets.

    Parameters:
    -----------
     - X1 : np.array, shape (n1, d)
         first set of samples, one sample per row
     - X2 : np.array, shape (n2, d)
         second set of samples, one sample per row

    Returns:
    --------
     - D : np.array, shape (n1, n2)
         D[i, j] equals the Euclidean distance between the i-th sample in X1
         and the j-th sample in X2.

    Raises:
    -------
     - ValueError
         if an input is not 2-D or the two sets have different feature counts
    '''
    X1 = np.asarray(X1)
    X2 = np.asarray(X2)

    # Fail loudly instead of letting NumPy broadcasting hide a shape mistake
    if X1.ndim != 2 or X2.ndim != 2:
        raise ValueError("X1 and X2 must be 2-D arrays of shape (n_samples, n_features).")
    if X1.shape[1] != X2.shape[1]:
        raise ValueError("X1 and X2 must have the same number of features.")

    D = np.zeros((X1.shape[0], X2.shape[0]))

    # One vectorised norm per row of X1: this avoids n1 * n2 Python-level calls
    # while keeping the temporary memory at O(n2 * d) rather than O(n1 * n2 * d).
    for i, sample in enumerate(X1):
        D[i, :] = np.linalg.norm(X2 - sample, axis=1)

    return D

In [32]:
def kNNClassify(Xtr, Ytr, k, Xte):
    """Classify test points with the k-nearest-neighbours rule.

    Parameters:
    -----------
     - Xtr : np.array
         training inputs, one sample per row
     - Ytr : np.array
         training labels, each +1 or -1 (one per training sample)
     - k : int
         number of neighbours used for the vote; values larger than the
         number of training points are clamped to that number
     - Xte : np.array
         test inputs, one sample per row

    Returns:
    --------
     - Ypred : np.array
         A 1D float array with one predicted label (+1 or -1) per test point.
         When the k neighbours are evenly split (possible for even k), the
         label of the single nearest neighbour is used, so 0 is never returned.

    Raises:
    -------
     - Exception
         if Ytr contains values other than +1 or -1
     - ValueError
         if the training set is empty or k is smaller than 1
    """
    # Flatten so that both (n,) and (n, 1) label arrays are handled the same way
    Ytr = np.asarray(Ytr).ravel()

    n_train = Xtr.shape[0]  # number of the training inputs

    # sanity checks
    if np.any(np.abs(Ytr) != 1):
        raise Exception("The values of Ytr should be +1 or -1.")

    if n_train == 0:
        raise ValueError("Xtr must contain at least one training sample.")

    # A negative k would silently slice "all but the last" neighbours
    if k < 1:
        raise ValueError("k must be a positive integer.")

    if k > n_train:
        print("k is greater than the number of points, setting k=n_train")
        k = n_train

    # Compute all the distances from TEST input and TRAINING input
    dist = allDistances(Xte, Xtr)

    # For every test point (row), indices of its k closest training points.
    # A stable sort makes the choice among equidistant points deterministic.
    nearest = np.argsort(dist, axis=1, kind="stable")[:, :k]

    # The predicted class is the sign of the average label of the k neighbours
    Ypred = np.sign(np.mean(Ytr[nearest], axis=1))

    # np.sign returns 0 on a tied vote, which is not a valid class label:
    # break ties with the label of the closest training point.
    ties = Ypred == 0
    Ypred[ties] = Ytr[nearest[ties, 0]]

    return Ypred

In [35]:
def KFoldCVkNN(Xtr, Ytr, num_folds, k_list, rng: int | Generator | None = None):
    """Run K-Fold CV for the kNN model

    The samples are randomly partitioned into `num_folds` folds. Each fold is
    used once as validation set while the remaining folds form the training
    part; training and validation errors are recorded for every k in `k_list`.

    Parameters:
    -----------
     - Xtr : np.array
         the full training set data
     - Ytr : np.array
         the full training set labels
     - num_folds : int
         the number of folds (between 2 and the number of samples)
     - k_list : List[int]
         the values of k (for k-NN) to try
     - rng : Optional[int | Generator | None]
         optional random state: a seed, an existing Generator, or None
         for a freshly seeded generator

    Returns:
    --------
     - best_k : int
         The value of k (in k_list) which obtains the best average validation error
     - best_k_idx : int
         The index of the best_k element in k_list
     - tr_err_mean : np.array
         A 1D array of the same length as k_list, with the average training error for each tested k.
     - tr_err_std : np.array
         A 1D array of the same length as k_list, with the standard deviation
         of the training error for each tested k.
     - val_err_mean : np.array
         A 1D array of the same length as k_list, with the average validation error for each tested k.
     - val_err_std : np.array
         A 1D array of the same length as k_list, with the standard deviation
         of the validation error for each tested k.

    Raises:
    -------
     - ValueError
         if k_list is empty, or num_folds is not between 2 and the number of samples
    """
    # default_rng accepts None, an int seed, or an existing Generator (returned as is)
    rng = np.random.default_rng(rng)
    # Ensures that k_list is a numpy array
    k_list = np.array(k_list)
    num_k = len(k_list)

    n_tot = Xtr.shape[0]

    if num_k == 0:
        raise ValueError("k_list must contain at least one value of k.")
    # Fewer than 2 folds leaves no training data; more folds than samples
    # produces empty validation folds (and a division by zero in the error).
    if not 2 <= num_folds <= n_tot:
        raise ValueError("num_folds must be between 2 and the number of samples.")

    # We want to compute 1 error for each `k` and each fold
    tr_errors = np.zeros((num_k, num_folds))   # train
    val_errors = np.zeros((num_k, num_folds))  # validation

    # `split_idx`: a list of arrays, each containing the validation indices for 1 fold
    rand_idx = rng.choice(n_tot, size=n_tot, replace=False)
    split_idx = np.array_split(rand_idx, num_folds)

    for fold_idx in range(num_folds):
        # Set the indices in boolean mask for all validation samples to `True`
        val_mask = np.zeros(n_tot, dtype=bool)
        val_mask[split_idx[fold_idx]] = True

        # Split training set in training part and validation part
        x_train = Xtr[~val_mask]
        y_train = Ytr[~val_mask]
        x_val = Xtr[val_mask]
        y_val = Ytr[val_mask]

        for k_idx, current_k in enumerate(k_list):
            # Training error of the kNN classifier for the given value of k
            Tpred = kNNClassify(x_train, y_train, current_k, x_train)
            tr_errors[k_idx, fold_idx] = calcError(Tpred, y_train)

            # Validation error of the kNN classifier for the given value of k
            Vpred = kNNClassify(x_train, y_train, current_k, x_val)
            val_errors[k_idx, fold_idx] = calcError(Vpred, y_val)

    # Calculate error statistics along the repetitions:
    # 1) mean training error, training error standard deviation
    tr_err_mean = np.mean(tr_errors, axis=1)
    tr_err_std = np.std(tr_errors, axis=1)
    # 2) mean validation error, validation error standard deviation
    val_err_mean = np.mean(val_errors, axis=1)
    val_err_std = np.std(val_errors, axis=1)
    # 3) index of the k which minimizes the mean validation error (first one on
    #    ties), returned as a plain int as documented, and the corresponding k
    best_k_idx = int(np.argmin(val_err_mean))
    best_k = k_list[best_k_idx]

    return best_k, best_k_idx, tr_err_mean, tr_err_std, val_err_mean, val_err_std

In [None]:
def perform_knn_process(dataset):
    """Read the train/test arrays out of ``dataset`` (unfinished stub).

    Only the four key lookups are performed; nothing is computed or returned yet.
    """
    # Keys are read in the order X_train, X_test, y_train, y_test
    Xtr, Xte = dataset['X_train'], dataset['X_test']
    Ytr, Yte = dataset['y_train'], dataset['y_test']

    
    

# Logistic Regression