# WSI - laboratorium 7

Zadanie: Modele bayesowskie

Autor: Jakub Mazurkiewicz (300226)

## Implementacja naiwnego klasyfikatora Bayesa

In [1]:
import numpy as np

class NaiveBayes:
    def __init__(self, x_train, y_train):
        sample_count, attrib_count = x_train.shape
        self.classes = np.unique(y_train)
        self.class_count = len(self.classes)

        self.mean = np.zeros((self.class_count, attrib_count))
        self.stdev = np.zeros((self.class_count, attrib_count))
        self.prob = np.zeros(self.class_count)

        for i, c in enumerate(self.classes):
            x_of_c = x_train[y_train == c]
            self.mean[i] = x_of_c.mean(axis=0)
            self.stdev[i] = x_of_c.std(axis=0)
            self.prob[i] = x_of_c.shape[0] / sample_count

    def classify(self, x):
        return [self._classify_one(one) for one in x]

    def _classify_one(self, x):
        search_table = [
            self.prob[i] * np.product(self._pdf_normal_dist(i, x))
            for i in range(self.class_count)
        ]
        predicted = np.argmax(search_table)
        return self.classes[predicted]

    def _pdf_normal_dist(self, class_index, x):
        mean = self.mean[class_index]
        stdev = self.stdev[class_index]
        part1 = 1 / (stdev * np.sqrt(2 * np.pi))
        part2 = np.exp(-0.5 * ((x - mean) / stdev) ** 2)
        return part1 * part2

## Przeprowadzanie eksperymentów

### Funkcja wybierająca zbiór uczący

In [2]:
def compose_train_set(data, chunk_size, chunk_to_skip):
    result = []
    for j in range(0, len(data), chunk_size):
        if chunk_to_skip != j:
            result += data[j:j+chunk_size]
    return np.array(result)

### Pojedynczy eksperyment dla zadanych danych

In [3]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score

def run_experiment(x, y, chunk_size, chunk_to_skip):
    x_train = compose_train_set(x, chunk_size, chunk_to_skip)
    y_train = compose_train_set(y, chunk_size, chunk_to_skip)
    algo = NaiveBayes(x_train, y_train)
    x_test = x[chunk_to_skip:chunk_to_skip+chunk_size]
    y_test = y[chunk_to_skip:chunk_to_skip+chunk_size]
    results = algo.classify(x_test)

    # TODO: more or less stats?
    print(f'Confusion matrix:\n{confusion_matrix(y_test, results)}')
    print(f'Precision: {list(precision_score(y_test, results, average=None))}')
    accuracy = sum(a == b for a, b in zip(results, y_test)) / len(results)
    print(f'Accuracy: {100 * accuracy:.2f}%')
    print(f'Recall: {list(recall_score(y_test, results, average=None))}')
    return accuracy

### Funkcja przeprowadzająca pełny eksperyment

Parametr `k` oznacza `k`-krotną walidację krzyżową.

In [4]:
from random import Random
from sklearn.datasets import load_iris

def experiment(seed, k):
    iris = load_iris()
    iris_size = len(iris.data)
    if iris_size % k != 0:
        print(f'Please pick `k` such that {iris_size} would be divisible '
              f'by it ({iris_size} % {k} = {iris_size % k})')
    else:
        zipped_iris = list(zip(iris.data, iris.target))
        Random(seed).shuffle(zipped_iris)
        iris.data, iris.target = zip(*zipped_iris)
        chunk_size = iris_size // k
        results = [run_experiment(iris.data, iris.target, chunk_size, i) for i in range(k)]

        print(f'Average accuracy: {100 * np.mean(results):.2f}%')

## Eksperymenty

### Dla `k = 5`

In [5]:
experiment(seed=777, k=5)

Confusion matrix:
[[12  0  0]
 [ 0  7  2]
 [ 0  0  9]]
Precision: [1.0, 1.0, 0.8181818181818182]
Accuracy: 93.33%
Recall: [1.0, 0.7777777777777778, 1.0]
Confusion matrix:
[[12  0  0]
 [ 0  7  3]
 [ 0  0  8]]
Precision: [1.0, 1.0, 0.7272727272727273]
Accuracy: 90.00%
Recall: [1.0, 0.7, 1.0]
Confusion matrix:
[[11  0  0]
 [ 0  7  3]
 [ 0  0  9]]
Precision: [1.0, 1.0, 0.75]
Accuracy: 90.00%
Recall: [1.0, 0.7, 1.0]
Confusion matrix:
[[11  0  0]
 [ 0  7  3]
 [ 0  0  9]]
Precision: [1.0, 1.0, 0.75]
Accuracy: 90.00%
Recall: [1.0, 0.7, 1.0]
Confusion matrix:
[[11  0  0]
 [ 0  6  3]
 [ 0  0 10]]
Precision: [1.0, 1.0, 0.7692307692307693]
Accuracy: 90.00%
Recall: [1.0, 0.6666666666666666, 1.0]
Average accuracy: 90.67%
