In [None]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin

class NBCDiscrete(BaseEstimator, ClassifierMixin):
    """Naive Bayes classifier for discrete (integer-coded) features.

    Parameters
    ----------
    domains : array-like of int, shape (n_features,)
        Number of distinct values of each feature; feature ``j`` is indexed
        with integers from ``0`` to ``domains[j] - 1``.
    laplace : bool, default=False
        If True, apply Laplace (add-one) smoothing to the conditional
        probabilities.
    logs : bool, default=False
        If True, ``predict_proba`` sums log-probabilities instead of
        multiplying raw probabilities, which avoids floating-point underflow
        when there are many features.

    Attributes
    ----------
    classes_ : ndarray, shape (K,)
        Sorted unique labels seen in ``fit``.
    PY_ : ndarray, shape (K,)
        A priori class distribution.
    P_ : ndarray, shape (K, n_features, max(domains))
        Conditional distributions: ``P_[k, j, v] = Prob(X_j = v | y = classes_[k])``.
    P_LOGS, PY_LOGS : ndarray
        ``np.log`` of ``P_`` / ``PY_`` (with ``eps`` added to keep zeros finite).
    """

    def __init__(self, domains, laplace=False, logs=False):
        self.laplace = laplace
        self.logs = logs
        self.domains = domains
        # Added to probabilities before np.log so that log(0) never occurs.
        self.eps = 1e-9

    def fit(self, X, y):
        """Estimate the class priors and per-feature conditional distributions.

        Parameters
        ----------
        X : integer ndarray, shape (m, n)
            Discretised samples; ``X[i, j]`` is used directly as an index
            into the value axis of ``P_``.
        y : array-like, shape (m,)
            Class labels.

        Returns
        -------
        self : NBCDiscrete
            The fitted estimator (scikit-learn convention).
        """
        X = np.asarray(X)
        y = np.asarray(y)
        m, n = X.shape
        domains = np.asarray(self.domains)
        q = int(np.max(domains))  # width of the value axis of P_

        # class_index[i] is the position of y[i] inside classes_; using the
        # inverse mapping from np.unique avoids any limit on the class count.
        self.classes_, class_index = np.unique(y, return_inverse=True)
        class_index = class_index.reshape(-1)
        K = self.classes_.size

        class_counts = np.bincount(class_index, minlength=K).astype(float)
        self.PY_ = class_counts / m  # a priori distribution

        # counts[k, j, v] = number of class-k samples with X_j == v.
        # np.add.at accumulates correctly when the same cell is hit repeatedly.
        counts = np.zeros((K, n, q))
        np.add.at(counts, (class_index[:, None], np.arange(n)[None, :], X), 1)

        if self.laplace:
            # Add one pseudo-count only to the values that belong to the
            # feature's own domain, so each distribution sums to one even
            # when the domains have different sizes.
            in_domain = np.arange(q)[None, :] < domains[:, None]
            self.P_ = (counts + in_domain) / (
                class_counts[:, None, None] + domains[None, :, None]
            )
        else:
            self.P_ = counts / class_counts[:, None, None]

        # Always keep the log tables so that predict_proba works whichever
        # value `logs` has at prediction time.
        self.P_LOGS = np.log(self.P_ + self.eps)
        self.PY_LOGS = np.log(self.PY_ + self.eps)
        return self

    def predict(self, X):
        """Return, for every row of X, the label with the highest posterior."""
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]

    def predict_proba(self, X):
        """Return posterior class probabilities for every row of X.

        Parameters
        ----------
        X : integer ndarray, shape (m, n)
            Discretised samples, coded the same way as the training data.

        Returns
        -------
        ndarray, shape (m, K)
            Row ``i`` holds ``Prob(y = classes_[k] | X[i])``; every row sums
            to one.
        """
        X = np.asarray(X)
        m, n = X.shape
        K = self.classes_.size
        feature_index = np.arange(n)

        if self.logs:
            # log prior + sum of log conditionals; indexing yields (K, m, n).
            log_joint = self.PY_LOGS[None, :] + self.P_LOGS[:, feature_index, X].sum(axis=2).T
            # Shift by the row maximum before exponentiating (log-sum-exp
            # trick) so the largest term is exactly exp(0) = 1.
            log_joint = log_joint - log_joint.max(axis=1, keepdims=True)
            scores = np.exp(log_joint)
        else:
            # prior * product of conditionals, applied per (sample, class).
            scores = self.PY_[None, :] * self.P_[:, feature_index, X].prod(axis=2).T

        # Normalise each sample separately. A row whose scores are all zero
        # (zero probabilities or underflow) falls back to the uniform
        # distribution instead of dividing by zero.
        totals = scores.sum(axis=1, keepdims=True)
        degenerate = totals[:, 0] == 0
        scores[degenerate] = 1.0
        totals[degenerate] = K
        return scores / totals

In [None]:
import numpy as np

def read_wine_data(path):
    """Load a comma-separated numeric file and split it into features and labels.

    The first column is returned as the label vector (cast to int8); all
    remaining columns form the feature matrix.

    Returns
    -------
    (X, y) : tuple of ndarray
    """
    table = np.genfromtxt(path, delimiter=",")
    labels = table[:, 0].astype(np.int8)
    features = table[:, 1:]
    return features, labels

def train_test_split(X, y, train_ratio=0.75):
    """Randomly split the samples into a training and a test part.

    The rows are shuffled with ``np.random.permutation`` (global NumPy RNG),
    then the first ``round(train_ratio * m)`` shuffled rows become the
    training set and the rest the test set.

    Returns
    -------
    X_train, y_train, X_test, y_test
    """
    n_samples = X.shape[0]
    order = np.random.permutation(n_samples)
    cut = int(np.round(train_ratio * n_samples))
    train_rows, test_rows = order[:cut], order[cut:]
    return X[train_rows], y[train_rows], X[test_rows], y[test_rows]

def discretize(X, bins, mins_ref=None, maxes_ref=None):
    """Map every column of X onto integer bin indices ``0 .. bins - 1``.

    Each column is split into ``bins`` equal-width intervals between its
    reference minimum and maximum; values outside that range are clipped to
    the first / last bin.

    Parameters
    ----------
    X : ndarray, shape (m, n)
        Values to discretise.
    bins : int
        Number of intervals per column.
    mins_ref, maxes_ref : ndarray, shape (n,), optional
        Per-column reference bounds (e.g. taken from the training set).
        Whichever one is missing is computed from X.

    Returns
    -------
    X_d : integer ndarray, shape (m, n)
        Bin indices; int8 whenever ``bins - 1`` fits, otherwise the smallest
        wider signed integer type.
    mins_ref, maxes_ref : ndarray
        The reference bounds that were used.
    """
    if mins_ref is None:
        mins_ref = np.min(X, axis=0)
    if maxes_ref is None:
        maxes_ref = np.max(X, axis=0)

    # A constant column has zero range; dividing by it would give NaN/inf.
    # Using 1 instead sends the constant value itself to bin 0.
    span = np.asarray(maxes_ref, dtype=float) - np.asarray(mins_ref, dtype=float)
    span = np.where(span == 0, 1.0, span)

    # Pick an output type able to hold the largest index, so a large `bins`
    # cannot wrap around to negative values.
    out_dtype = next(
        (dt for dt in (np.int8, np.int16, np.int32) if bins - 1 <= np.iinfo(dt).max),
        np.int64,
    )
    X_d = np.clip(np.floor((X - mins_ref) / span * bins), 0, bins - 1).astype(out_dtype)
    return X_d, mins_ref, maxes_ref

# Driver for the wine experiment: load, split, discretise, fit the discrete
# naive Bayes classifier and report train / test accuracy.
if __name__ == '__main__':
    bins = 5 # number of intervals used when turning continuous variables into discrete ones
    np.random.seed(1) # fixes the shuffle performed inside train_test_split
    X, y = read_wine_data("/wine.data") # load the dataset

    X_train, y_train, X_test, y_test = train_test_split(X, y, train_ratio=0.8) # split into a training and a test part

    X_train_d, mins_ref, maxes_ref = discretize(X_train, bins) # discretisation; bounds come from the training part
    X_test_d, _, _ = discretize(X_test, bins, mins_ref, maxes_ref) # test part reuses the training bounds

    n = X.shape[1]
    domains = np.ones(n, dtype=np.int8) * bins # [5, ..., 5] x 13
    clf = NBCDiscrete(domains, laplace=True)
    clf.fit(X_train_d, y_train)

    print(clf.P_)
    acc_train = clf.score(X_train_d, y_train)
    acc_test = clf.score(X_test_d, y_test)
    print(f"ACC -> TRAIN: {acc_train}, TEST: {acc_test}")
    # TODO: pick a dataset we like; it has to meet the course ("zmsi") requirements: 1000 examples, 20 attributes. Check which variables are continuous and discretise only those, find out where the class attribute is, and add log-domain computation (logs=False).
    # TODO: if/else on `scores` for the logarithmic variant (np.log), possibly keeping a table of logs; the point is numerical safety. Artificial test: np.tile().

[[[0.01818182 0.01818182 0.21818182 0.45454545 0.29090909]
  [0.45454545 0.36363636 0.05454545 0.10909091 0.01818182]
  [0.01818182 0.01818182 0.21818182 0.47272727 0.27272727]
  [0.09090909 0.56363636 0.29090909 0.03636364 0.01818182]
  [0.01818182 0.49090909 0.4        0.07272727 0.01818182]
  [0.01818182 0.03636364 0.34545455 0.52727273 0.07272727]
  [0.01818182 0.01818182 0.14545455 0.52727273 0.29090909]
  [0.21818182 0.6        0.12727273 0.03636364 0.01818182]
  [0.01818182 0.30909091 0.49090909 0.14545455 0.03636364]
  [0.03636364 0.58181818 0.30909091 0.05454545 0.01818182]
  [0.01818182 0.05454545 0.47272727 0.41818182 0.03636364]
  [0.01818182 0.01818182 0.21818182 0.49090909 0.25454545]
  [0.01818182 0.10909091 0.4        0.34545455 0.12727273]]

 [[0.33870968 0.5        0.11290323 0.03225806 0.01612903]
  [0.56451613 0.22580645 0.11290323 0.06451613 0.03225806]
  [0.03225806 0.24193548 0.35483871 0.27419355 0.09677419]
  [0.03225806 0.27419355 0.5        0.12903226 0.06451

In [None]:
import numpy as np

# values in this dataset go up to 10000, so int16 is required
def read_student_data(path):
    """Load the semicolon-separated student file into features and labels.

    The file is parsed twice: once for the first 36 columns as numbers and
    once for the last column as text. The first row of each result
    (presumably the header line) is dropped. Text labels are encoded as
    Dropout -> 1, Enrolled -> 2, Graduate -> 3.

    Returns
    -------
    (X, y) : tuple of ndarray
        Feature matrix and int16 label vector.
    """
    label_codes = {"Dropout" : 1, "Enrolled" : 2, "Graduate": 3}
    features = np.genfromtxt(path, delimiter=";", usecols=range(0, 36))[1:, :]
    # With dtype=str genfromtxt keeps the column as text instead of guessing a numeric type.
    targets = np.genfromtxt(path, delimiter=";", dtype=str, usecols=(-1))[1:]
    y = np.array([label_codes[name] for name in targets], dtype=np.int16)
    return features, y

def train_test_split(X, y, train_ratio=0.75):
    """Randomly split the samples into a training and a test part.

    The rows are shuffled with ``np.random.permutation`` (global NumPy RNG),
    then the first ``round(train_ratio * m)`` shuffled rows become the
    training set and the rest the test set.

    Returns
    -------
    X_train, y_train, X_test, y_test
    """
    n_samples = X.shape[0]
    order = np.random.permutation(n_samples)
    cut = int(np.round(train_ratio * n_samples))
    train_rows, test_rows = order[:cut], order[cut:]
    return X[train_rows], y[train_rows], X[test_rows], y[test_rows]

def discretize(X, bins, mins_ref=None, maxes_ref=None):
    """Map every column of X onto integer bin indices ``0 .. bins - 1``.

    Each column is split into ``bins`` equal-width intervals between its
    reference minimum and maximum; values outside that range are clipped to
    the first / last bin.

    Parameters
    ----------
    X : ndarray, shape (m, n)
        Values to discretise.
    bins : int
        Number of intervals per column.
    mins_ref, maxes_ref : ndarray, shape (n,), optional
        Per-column reference bounds (e.g. taken from the training set).
        Whichever one is missing is computed from X.

    Returns
    -------
    X_d : integer ndarray, shape (m, n)
        Bin indices; int16 whenever ``bins - 1`` fits, otherwise a wider
        signed integer type.
    mins_ref, maxes_ref : ndarray
        The reference bounds that were used.
    """
    if mins_ref is None:
        mins_ref = np.min(X, axis=0)
    if maxes_ref is None:
        maxes_ref = np.max(X, axis=0)

    # A constant column has zero range; dividing by it would give NaN/inf.
    # Using 1 instead sends the constant value itself to bin 0.
    span = np.asarray(maxes_ref, dtype=float) - np.asarray(mins_ref, dtype=float)
    span = np.where(span == 0, 1.0, span)

    # Pick an output type able to hold the largest index, so a large `bins`
    # cannot wrap around to negative values.
    out_dtype = next(
        (dt for dt in (np.int16, np.int32) if bins - 1 <= np.iinfo(dt).max),
        np.int64,
    )
    X_d = np.clip(np.floor((X - mins_ref) / span * bins), 0, bins - 1).astype(out_dtype)
    return X_d, mins_ref, maxes_ref


# Driver for the student-data experiment: one random split, then the
# classifier is refitted and scored for every bin count in `bins`.
if __name__ == '__main__':
    bins = [2, 5, 10, 25, 50, 100, 250, 500, 1000]
    #np.random.seed(1)
    # NOTE(review): with the seed line above disabled, the split (and the accuracies) differ between runs.

    X, y = read_student_data("/data.csv")
    print(X.shape)
    results = np.empty((len(bins), 3)) # one row per bin count: (bins, train accuracy, test accuracy)
    X_train, y_train, X_test, y_test = train_test_split(X, y, train_ratio=0.8) # split into a training and a test part
    for index, bin_value in enumerate(bins):
        X_train_d, mins_ref, maxes_ref = discretize(X_train, bin_value) # for reference: the continuous variables are columns [6, 12, 33, 34, 35]; here every column is discretised
        X_test_d, _, _ = discretize(X_test, bin_value, mins_ref, maxes_ref) # test part reuses the training bounds

        n = X.shape[1]
        domains = np.ones(n, dtype=np.int16) * bin_value
        clf = NBCDiscrete(domains, laplace=True, logs=True)

        clf.fit(X_train_d, y_train)
        #print(clf.P_)

        acc_train = clf.score(X_train_d, y_train)
        acc_test = clf.score(X_test_d, y_test)
        results[index] = (bin_value, acc_train, acc_test)
        # The priors depend only on y_train, so the same vector is printed on every iteration.
        print(clf.PY_) # author's note: "we have a gain, so this is a worthwhile classifier" (presumably: accuracy beats the largest class prior)
    print(results)

(4424, 36)
[0.32495055 0.17660356 0.49844589]
[0.32495055 0.17660356 0.49844589]
[0.32495055 0.17660356 0.49844589]
[0.32495055 0.17660356 0.49844589]
[0.32495055 0.17660356 0.49844589]
[0.32495055 0.17660356 0.49844589]
[0.32495055 0.17660356 0.49844589]
[0.32495055 0.17660356 0.49844589]
[0.32495055 0.17660356 0.49844589]
[[2.00000000e+00 6.86634643e-01 6.79096045e-01]
 [5.00000000e+00 7.37213902e-01 7.10734463e-01]
 [1.00000000e+01 7.29867194e-01 6.92655367e-01]
 [2.50000000e+01 7.36931337e-01 6.98305085e-01]
 [5.00000000e+01 7.50211924e-01 7.12994350e-01]
 [1.00000000e+02 7.55863238e-01 7.19774011e-01]
 [2.50000000e+02 7.37213902e-01 7.01694915e-01]
 [5.00000000e+02 7.21672789e-01 7.02824859e-01]
 [1.00000000e+03 7.08392201e-01 6.92655367e-01]]
