In [10]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold

### 1ª QUESTÃO

Considere o conjunto de dados disponível em kc2.csv, organizado em 22 colunas, sendo as 21 primeiras colunas os atributos e a última coluna a saída. Os 21
atributos são referentes à caracterização de códigos-fontes para processamento
de dados na NASA. A saída é a indicação de ausência (0) ou existência (1) de
defeitos (os dados foram balanceados via subamostragem). Maiores detalhes
sobre os dados podem ser conferidos em https://www.openml.org/search?
type=data&sort=runs&id=1063&status=active.

a) Considerando uma validação cruzada em 10 folds, avalie modelos de classificação binária nos dados em questão. Para tanto, use as abordagens
abaixo:

- KNN (escolha k = 1 e k = 5, distância Euclidiana e Mahalonobis,
totalizando 4 combinações);
- Árvore de decisão (você pode usar uma implementação já existente, como a do scikit-learn, com índices de impureza de gini e entropia).

b) Para cada modelo criado, reporte valor médio e desvio padrão das métricas
de acurácia, revocação, precisão e F1-score.


In [11]:
#KNN

def load_data(filepath):
    data = np.loadtxt(filepath, delimiter=',')
    X = data[:, :-1]
    y = data[:, -1]
    return X, y

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2)**2))

def mahalanobis_distance(x1, x2, VI):
    delta = x1 - x2
    return np.sqrt(np.dot(np.dot(delta, VI), delta.T))

class KNN:

    def __init__(self, k, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None
        self.S_inv = None

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        if self.distance_metric == 'mahalanobis':
            cov_matrix = np.cov(X_train, rowvar=False) + np.eye(X_train.shape[1]) * 1e-6
            self.S_inv = np.linalg.inv(cov_matrix)

    def predict(self, X_test):
        predictions = [self._predict_single(x) for x in X_test]
        return np.array(predictions)

    def _predict_single(self, x):
        distances = []
        for i, x_train in enumerate(self.X_train):
            if self.distance_metric == 'euclidean':
                dist = euclidean_distance(x, x_train)
            elif self.distance_metric == 'mahalanobis':
                dist = mahalanobis_distance(x, x_train, self.S_inv)
            else:
                raise ValueError("Métrica de distância não suportada. Escolha 'euclidean' ou 'mahalanobis'.")
            distances.append((dist, self.y_train[i]))

        distances.sort(key=lambda x: x[0])
        k_nearest_neighbors = distances[:self.k]

        k_nearest_labels = [label for dist, label in k_nearest_neighbors]

        unique_labels, counts = np.unique(k_nearest_labels, return_counts=True)
        return unique_labels[np.argmax(counts)]

def calculate_metrics(y_true, y_pred):
    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))

    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0

    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return accuracy, recall, precision, f1_score


def k_fold_cross_validation(X, y, k_folds=10):
    n_samples = X.shape[0]
    indices = np.arange(n_samples)
    np.random.shuffle(indices)

    fold_size = n_samples // k_folds
    folds = []
    for i in range(k_folds):
        start = i * fold_size
        end = (i + 1) * fold_size if i < k_folds - 1 else n_samples
        folds.append(indices[start:end])
    return folds

if __name__ == "__main__":
    filepath = 'kc2.csv'
    X, y = load_data(filepath)

    k_folds = 10
    folds = k_fold_cross_validation(X, y, k_folds)

    knn_configs = [
        {'k': 1, 'distance': 'euclidean'},
        {'k': 5, 'distance': 'euclidean'},
        {'k': 1, 'distance': 'mahalanobis'},
        {'k': 5, 'distance': 'mahalanobis'}
    ]

    results = {}

    for config in knn_configs:
        k_val = config['k']
        dist_metric = config['distance']
        print(f"\nAvaliação para k={k_val}, Métrica: {dist_metric}")

        accuracies = []
        recalls = []
        precisions = []
        f1_scores = []

        for i, test_indices in enumerate(folds):
            train_indices = np.concatenate([f for j, f in enumerate(folds) if j != i])

            X_train, y_train = X[train_indices], y[train_indices]
            X_test, y_test = X[test_indices], y[test_indices]

            knn_model = KNN(k=k_val, distance_metric=dist_metric)
            knn_model.fit(X_train, y_train)

            y_pred = knn_model.predict(X_test)

            acc, rec, prec, f1 = calculate_metrics(y_test, y_pred)
            accuracies.append(acc)
            recalls.append(rec)
            precisions.append(prec)
            f1_scores.append(f1)

            print(f"  Fold {i+1}: Acc={acc:.4f}, Rec={rec:.4f}, Prec={prec:.4f}, F1={f1:.4f}")

        results[f'k={k_val}, dist={dist_metric}'] = {
            'accuracy': {'mean': np.mean(accuracies), 'std': np.std(accuracies)},
            'recall': {'mean': np.mean(recalls), 'std': np.std(recalls)},
            'precision': {'mean': np.mean(precisions), 'std': np.std(precisions)},
            'f1_score': {'mean': np.mean(f1_scores), 'std': np.std(f1_scores)}
        }

    print("\nResultados:")
    for config_str, metrics in results.items():
        print(f"\nConfiguração: {config_str}")
        print(f"  Acurácia: Média={metrics['accuracy']['mean']:.4f}, Desvio Padrão={metrics['accuracy']['std']:.4f}")
        print(f"  Revocação: Média={metrics['recall']['mean']:.4f}, Desvio Padrão={metrics['recall']['std']:.4f}")
        print(f"  Precisão: Média={metrics['precision']['mean']:.4f}, Desvio Padrão={metrics['precision']['std']:.4f}")
        print(f"  F1-Score: Média={metrics['f1_score']['mean']:.4f}, Desvio Padrão={metrics['f1_score']['std']:.4f}")




Avaliação para k=1, Métrica: euclidean
  Fold 1: Acc=0.6667, Rec=0.5556, Prec=0.6250, F1=0.5882
  Fold 2: Acc=0.6190, Rec=0.5556, Prec=0.5556, F1=0.5556
  Fold 3: Acc=0.6190, Rec=0.5000, Prec=0.6250, F1=0.5556
  Fold 4: Acc=0.7619, Rec=0.7143, Prec=0.6250, F1=0.6667
  Fold 5: Acc=0.7143, Rec=0.8182, Prec=0.6923, F1=0.7500
  Fold 6: Acc=0.5714, Rec=0.4286, Prec=0.8571, F1=0.5714
  Fold 7: Acc=0.8095, Rec=0.7000, Prec=0.8750, F1=0.7778
  Fold 8: Acc=0.6667, Rec=0.6364, Prec=0.7000, F1=0.6667
  Fold 9: Acc=0.5714, Rec=0.6364, Prec=0.5833, F1=0.6087
  Fold 10: Acc=0.6800, Rec=0.8000, Prec=0.7059, F1=0.7500

Avaliação para k=5, Métrica: euclidean
  Fold 1: Acc=0.6667, Rec=0.6667, Prec=0.6000, F1=0.6316
  Fold 2: Acc=0.7619, Rec=0.7778, Prec=0.7000, F1=0.7368
  Fold 3: Acc=0.7143, Rec=0.7000, Prec=0.7000, F1=0.7000
  Fold 4: Acc=0.9048, Rec=0.8571, Prec=0.8571, F1=0.8571
  Fold 5: Acc=0.7143, Rec=0.7273, Prec=0.7273, F1=0.7273
  Fold 6: Acc=0.6190, Rec=0.5000, Prec=0.8750, F1=0.6364
  Fold 

In [12]:
#árvore de decisão

def load_data(filepath):
    data = np.loadtxt(filepath, delimiter=',')
    X = data[:, :-1]
    y = data[:, -1]
    return X, y

def calculate_metrics(y_true, y_pred):
    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))

    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0

    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return accuracy, recall, precision, f1_score

if __name__ == "__main__":
    filepath = 'kc2.csv'
    X, y = load_data(filepath)

    k_folds = 10

    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

    dt_configs = [
        {'criterion': 'gini'},
        {'criterion': 'entropy'}
    ]

    results = {}

    for config in dt_configs:
        criterion_metric = config['criterion']
        print(f"\nAvaliação para Árvore de Decisão, Critério: {criterion_metric}")

        accuracies = []
        recalls = []
        precisions = []
        f1_scores = []

        for i, (train_index, test_index) in enumerate(skf.split(X, y)):
            X_train, y_train = X[train_index], y[train_index]
            X_test, y_test = X[test_index], y[test_index]

            dt_model = DecisionTreeClassifier(criterion=criterion_metric, random_state=42)
            dt_model.fit(X_train, y_train)

            y_pred = dt_model.predict(X_test)

            acc, rec, prec, f1 = calculate_metrics(y_test, y_pred)
            accuracies.append(acc)
            recalls.append(rec)
            precisions.append(prec)
            f1_scores.append(f1)

            print(f"  Fold {i+1}: Acc={acc:.4f}, Rec={rec:.4f}, Prec={prec:.4f}, F1={f1:.4f}")

        results[f'DT, criterion={criterion_metric}'] = {
            'accuracy': {'mean': np.mean(accuracies), 'std': np.std(accuracies)},
            'recall': {'mean': np.mean(recalls), 'std': np.std(recalls)},
            'precision': {'mean': np.mean(precisions), 'std': np.std(precisions)},
            'f1_score': {'mean': np.mean(f1_scores), 'std': np.std(f1_scores)}
        }

    print("\nResultados:")
    for config_str, metrics in results.items():
        print(f"\nConfiguração: {config_str}")
        print(f"  Acurácia: Média={metrics['accuracy']['mean']:.4f}, Desvio Padrão={metrics['accuracy']['std']:.4f}")
        print(f"  Revocação: Média={metrics['recall']['mean']:.4f}, Desvio Padrão={metrics['recall']['std']:.4f}")
        print(f"  Precisão: Média={metrics['precision']['mean']:.4f}, Desvio Padrão={metrics['precision']['std']:.4f}")
        print(f"  F1-Score: Média={metrics['f1_score']['mean']:.4f}, Desvio Padrão={metrics['f1_score']['std']:.4f}")


Avaliação para Árvore de Decisão, Critério: gini
  Fold 1: Acc=0.6364, Rec=0.4545, Prec=0.7143, F1=0.5556
  Fold 2: Acc=0.5909, Rec=0.5455, Prec=0.6000, F1=0.5714
  Fold 3: Acc=0.7727, Rec=0.7273, Prec=0.8000, F1=0.7619
  Fold 4: Acc=0.4545, Rec=0.3636, Prec=0.4444, F1=0.4000
  Fold 5: Acc=0.7619, Rec=0.8000, Prec=0.7273, F1=0.7619
  Fold 6: Acc=0.8095, Rec=0.8000, Prec=0.8000, F1=0.8000
  Fold 7: Acc=0.8095, Rec=0.7000, Prec=0.8750, F1=0.7778
  Fold 8: Acc=0.8571, Rec=0.8182, Prec=0.9000, F1=0.8571
  Fold 9: Acc=0.7143, Rec=0.7273, Prec=0.7273, F1=0.7273
  Fold 10: Acc=0.5714, Rec=0.5455, Prec=0.6000, F1=0.5714

Avaliação para Árvore de Decisão, Critério: entropy
  Fold 1: Acc=0.6364, Rec=0.5455, Prec=0.6667, F1=0.6000
  Fold 2: Acc=0.5455, Rec=0.5455, Prec=0.5455, F1=0.5455
  Fold 3: Acc=0.7273, Rec=0.7273, Prec=0.7273, F1=0.7273
  Fold 4: Acc=0.4091, Rec=0.3636, Prec=0.4000, F1=0.3810
  Fold 5: Acc=0.8571, Rec=0.9000, Prec=0.8182, F1=0.8571
  Fold 6: Acc=0.8571, Rec=0.9000, Prec=0.