# Lista 3

## `Questão 1`
Considere o conjunto de dados disponível em kc2.csv, organizado em 22 colunas, sendo as 21 primeiras colunas os atributos e a última coluna a saída. Os 21 atributos são referentes à caracterização de códigos-fontes para processamento de dados na NASA. A saída é a indicação de ausência (0) ou existência (1) de defeitos (os dados foram balanceados via subamostragem). Maiores detalhes sobre os dados podem ser conferidos em https://www.openml.org/search?type=data&sort=runs&id=1063&status=active.

A) Considerando uma validação cruzada em 10 folds, avalie modelos de classificação binária nos dados em questão. Para tanto, use as abordagens abaixo:

- **KNN** (escolha k = 1 e k = 5, distância Euclidiana e Mahalonobis, totalizando 4 combinações);
- **Árvore de decisão** (você pode usar uma implementação já existente com índices de impureza de gini e entropia).

In [389]:
from typing import Callable
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from typing import Tuple

np.random.seed(42)

In [390]:
kc2_dataset = np.loadtxt('kc2.csv', delimiter=',')
kc2_dataset.shape

(214, 22)

In [391]:
x_kc2_dataset = kc2_dataset[:, :-1]
x_kc2_dataset.shape

(214, 21)

In [392]:

y_kc2_dataset = kc2_dataset[:, [-1]]
y_kc2_dataset.shape

(214, 1)

In [393]:
def kfold_split(x: np.ndarray, y: np.ndarray, n_partitions: int):
    k_folds = []
    xy = np.random.permutation(np.hstack((x, y)))

    # every fold gets the rounded down number of: number of samples / number of partitions
    for i in range(n_partitions):
        k_folds.append(xy[i*(xy.shape[0] // n_partitions) : (i+1)*(xy.shape[0] // n_partitions), :])

    # The remaining samples get distributed between the first folds, one each
    if(x.shape[0] % n_partitions > 0):
        for i in range(x.shape[0] % n_partitions):
            k_folds[i] = np.vstack((k_folds[i], xy[[-(i + 1)], :]))

    return np.array(k_folds, dtype="object")

def kfold_even_split(x: np.ndarray, y: np.ndarray, n_partitions: int):
    k_folds = []
    xy = np.random.permutation(np.hstack((x, y)))

    # every fold gets the rounded down number of: number of samples / number of partitions
    for i in range(n_partitions):
        k_folds.append(xy[i*(xy.shape[0] // n_partitions) : (i+1)*(xy.shape[0] // n_partitions), :])

    return np.array(k_folds, dtype="object")

def loo_split(x: np.ndarray, y: np.ndarray):
    return kfold_split(x, y, x.shape[0])

In [394]:
def minmax_normalize(x: np.ndarray, y: np.ndarray):
    epsilon = 1e-15

    x_columns_max = x.max(axis=0, keepdims=True)
    ymax = y.max()
    
    x_columns_min = x.min(axis=0, keepdims=True)
    ymin = y.min()

    x = (x - x_columns_min) / (x_columns_max - x_columns_min + epsilon)
    y = (y - ymin) / (ymax - ymin)

    # Values that will be needed for denormalization
    xscale_tuple = (x_columns_max - x_columns_min, x_columns_min + epsilon)
    yscale_tuple = (ymax - ymin, ymin)
    
    return x, y, xscale_tuple, yscale_tuple

def normalize_with_scale(x: np.ndarray, x_scale: Tuple[float, float]):
    return (x - x_scale[0]) / x_scale[1]

def minmax_denormalize(x: np.ndarray, y: np.ndarray, xscale: Tuple[np.ndarray, np.ndarray], yscale: Tuple[float, float]):
    y = y * yscale[0] + yscale[1]
    x = x * xscale[0] + xscale[1]

    return x, y

In [395]:
def euclidian_distances(x: np.ndarray, y: np.ndarray, datapoint: np.ndarray):
    # if(datapoint.shape[0] > 1):
    #     datapoint_distances = []

    #     for line in datapoint:
    #         datapoint_distances.append((((x - line.reshape(1, -1))**2).sum(1))**0.5)
        
        # return np.array(datapoint_distances, dtype="object")
            
    # else:
        return (((x - datapoint)**2).sum(1))**0.5

def manhattam_distances(x: np.ndarray, y: np.ndarray, datapoint: np.ndarray):
    pass

def minkowski_distances(x: np.ndarray, y: np.ndarray, datapoint: np.ndarray):
    pass

def mahalanobis_distances(x: np.ndarray, y: np.ndarray, datapoint: np.ndarray):
    pass

In [396]:
def knn(x: np.ndarray, y: np.ndarray, k: int, dist_func: Callable[[np.ndarray, np.ndarray, np.ndarray], np.ndarray], datapoints: np.ndarray):
    # 
    y_datapoints = datapoints[:, [-1]]
    datapoints = datapoints[:, :-1]

    # normalize data
    x_normalized, y_normalized, x_scale, y_scale = minmax_normalize(x, y)

    # calculate distances
    datapoints_distances = []

    for line in datapoints:
        datapoints_distances.append(
            np.hstack((dist_func(x_normalized, y_normalized, normalize_with_scale(line, x_scale)).reshape(-1, 1),
                       y))
            )
    
    datapoints_distances = np.array(datapoints_distances, dtype='float64')
    pred_classes = np.zeros((datapoints.shape[0], 2))

    for i, line in enumerate(datapoints_distances):
        # sort by nearest neighbour
        sorted_indices = np.argsort(line[:, 0])
        sorted_datapoint = line[sorted_indices]

        sorted_distances = sorted_datapoint[:, 0]
        sorted_classes = np.array(sorted_datapoint[:, 1], dtype='int64')

        # calculate predicted class
        k_nearest_classes = sorted_classes[:k]
        pred_class = np.bincount(k_nearest_classes).argmax()
        pred_classes[i] = [pred_class.flatten(), y_datapoints.flatten()[i]]
        # pred_classes.append([pred_class, y_datapoints.flatten[i]])
    
    return pred_classes

### A) Considerando uma validação cruzada em 10 folds, avalie modelos de classificação binária nos dados em questão. Para tanto, use as abordagens abaixo:

- ### **KNN** (escolha k = 1 e k = 5, distância Euclidiana e Mahalonobis, totalizando 4 combinações);

In [397]:
kc2_dataset_kfolds = kfold_even_split(x_kc2_dataset, y_kc2_dataset, 10)
kc2_dataset_kfolds.shape

(10, 21, 22)

In [398]:
for each in kc2_dataset_kfolds:
    print(each.shape)

(21, 22)
(21, 22)
(21, 22)
(21, 22)
(21, 22)
(21, 22)
(21, 22)
(21, 22)
(21, 22)
(21, 22)


In [399]:
def stats(y: np.ndarray, pred:np.ndarray):

    true_pos = ((y==1) & (pred == 1)).sum()
    true_neg = ((y==0) & (pred == 0)).sum()
    false_pos = ((y==0) & (pred == 1)).sum()
    false_neg = ((y==1) & (pred == 0)).sum()
    global_acc = (true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg)
    class1_acc = true_pos / (true_pos + false_pos)
    class2_acc = true_neg / (true_neg + false_neg)
    precision = true_pos / (true_pos + false_neg)
    revoke = true_pos / (true_pos + false_pos)

    return global_acc, class1_acc, class2_acc, precision, revoke

In [406]:
print(f"K: 1")

for i, fold in enumerate(kc2_dataset_kfolds):
    if(i == 0):
        kc2_dataset_trainning = kc2_dataset_kfolds[i+1:].reshape((-1, kc2_dataset_kfolds.shape[-1]))
    else:
        kc2_dataset_trainning = np.vstack((kc2_dataset_kfolds[:i], kc2_dataset_kfolds[i+1:])).reshape((-1, kc2_dataset_kfolds.shape[-1]))

    x_kc2_dataset_trainning = kc2_dataset_trainning[:, :-1]
    y_kc2_dataset_trainning = kc2_dataset_trainning[:, [-1]]
    x_kc2_dataset_test = fold[:, :-1]
    y_kc2_dataset_test = fold[:, [-1]]

    # checking if everything has the correct shape
    # print(f"iteration - {i+1}")
    # print(kc2_dataset_trainning.shape)
    # print(x_kc2_dataset_trainning.shape)
    # print(y_kc2_dataset_trainning.shape)
    # print(x_kc2_dataset_test.shape)

    knn_predictions = knn(x_kc2_dataset_trainning, y_kc2_dataset_trainning, 1, euclidian_distances, fold)

    global_acc, class1_acc, class2_acc, precision, revoke = stats(knn_predictions[:, 0], knn_predictions[:, 1])

    print(f"STATS - fold {i}\n")
    print(f"global acc: {global_acc}")
    print(f"class 1 acc: {class1_acc}")
    print(f"class 2 acc: {class2_acc}")
    print(f"precision: {class2_acc}")
    print(f"revoke: {class2_acc}\n\n")

K: 1
STATS - fold 0

global acc: 0.5714285714285714
class 1 acc: 1.0
class 2 acc: 0.0
precision: 0.0
revoke: 0.0


STATS - fold 1

global acc: 0.47619047619047616
class 1 acc: 0.0
class 2 acc: 1.0
precision: 1.0
revoke: 1.0


STATS - fold 2

global acc: 0.5714285714285714
class 1 acc: 0.0
class 2 acc: 1.0
precision: 1.0
revoke: 1.0


STATS - fold 3

global acc: 0.5714285714285714
class 1 acc: 0.0
class 2 acc: 1.0
precision: 1.0
revoke: 1.0


STATS - fold 4

global acc: 0.5238095238095238
class 1 acc: 0.0
class 2 acc: 1.0
precision: 1.0
revoke: 1.0


STATS - fold 5

global acc: 0.3333333333333333
class 1 acc: 0.0
class 2 acc: 1.0
precision: 1.0
revoke: 1.0


STATS - fold 6

global acc: 0.5714285714285714
class 1 acc: 0.0
class 2 acc: 1.0
precision: 1.0
revoke: 1.0


STATS - fold 7

global acc: 0.5714285714285714
class 1 acc: 0.0
class 2 acc: 1.0
precision: 1.0
revoke: 1.0


STATS - fold 8

global acc: 0.42857142857142855
class 1 acc: 0.0
class 2 acc: 1.0
precision: 1.0
revoke: 1.0


STA

  pred_classes[i] = [pred_class.flatten(), y_datapoints.flatten()[i]]
  precision = true_pos / (true_pos + false_neg)


In [407]:
print(f"K: 5")

for i, fold in enumerate(kc2_dataset_kfolds):
    if(i == 0):
        kc2_dataset_trainning = kc2_dataset_kfolds[i+1:].reshape((-1, kc2_dataset_kfolds.shape[-1]))
    else:
        kc2_dataset_trainning = np.vstack((kc2_dataset_kfolds[:i], kc2_dataset_kfolds[i+1:])).reshape((-1, kc2_dataset_kfolds.shape[-1]))

    x_kc2_dataset_trainning = kc2_dataset_trainning[:, :-1]
    y_kc2_dataset_trainning = kc2_dataset_trainning[:, [-1]]
    x_kc2_dataset_test = fold[:, :-1]
    y_kc2_dataset_test = fold[:, [-1]]

    # checking if everything has the correct shape
    # print(f"iteration - {i+1}")
    # print(kc2_dataset_trainning.shape)
    # print(x_kc2_dataset_trainning.shape)
    # print(y_kc2_dataset_trainning.shape)
    # print(x_kc2_dataset_test.shape)

    knn_predictions = knn(x_kc2_dataset_trainning, y_kc2_dataset_trainning, 5, euclidian_distances, fold)

    global_acc, class1_acc, class2_acc, precision, revoke = stats(knn_predictions[:, 0], knn_predictions[:, 1])

    print(f"STATS - fold {i}\n")
    print(f"global acc: {global_acc}")
    print(f"class 1 acc: {class1_acc}")
    print(f"class 2 acc: {class2_acc}")
    print(f"precision: {class2_acc}")
    print(f"revoke: {class2_acc}\n\n")

K: 1
STATS - fold 0

global acc: 0.5714285714285714
class 1 acc: 1.0
class 2 acc: 0.0
precision: 0.0
revoke: 0.0


STATS - fold 1

global acc: 0.47619047619047616
class 1 acc: 0.0
class 2 acc: 1.0
precision: 1.0
revoke: 1.0


STATS - fold 2

global acc: 0.5714285714285714
class 1 acc: 0.0
class 2 acc: 1.0
precision: 1.0
revoke: 1.0


STATS - fold 3

global acc: 0.5714285714285714
class 1 acc: 0.0
class 2 acc: 1.0
precision: 1.0
revoke: 1.0


STATS - fold 4

global acc: 0.5238095238095238
class 1 acc: 0.0
class 2 acc: 1.0
precision: 1.0
revoke: 1.0


STATS - fold 5

global acc: 0.3333333333333333
class 1 acc: 0.0
class 2 acc: 1.0
precision: 1.0
revoke: 1.0


STATS - fold 6

global acc: 0.42857142857142855
class 1 acc: 1.0
class 2 acc: 0.0
precision: 0.0
revoke: 0.0


STATS - fold 7

global acc: 0.42857142857142855
class 1 acc: 1.0
class 2 acc: 0.0
precision: 0.0
revoke: 0.0


STATS - fold 8

global acc: 0.5714285714285714
class 1 acc: 1.0
class 2 acc: 0.0
precision: 0.0
revoke: 0.0


ST

  pred_classes[i] = [pred_class.flatten(), y_datapoints.flatten()[i]]
  precision = true_pos / (true_pos + false_neg)
