## MISA (2024-2025)
- Alohan'ny mamerina dia avereno atao Run ny notebook iray manontolo. Ny fanaovana azy dia redémarrena mihitsy ny kernel aloha (jereo menubar, safidio **Kernel$\rightarrow$Restart Kernel and Run All Cells**).

- Izay misy hoe `YOUR CODE HERE` na `YOUR ANSWER HERE` ihany no fenoina. Afaka manampy cells vaovao raha ilaina. Aza adino ny mameno references eo ambany raha ilaina.

## References
* [Naive Bayes Documentation](https://scikit-learn.org/1.5/modules/naive_bayes.html)
* [Scikit-learn QDA Documentation](https://scikit-learn.org/0.16/modules/generated/sklearn.qda.QDA.html)
* [Machine Learning Mastery - Gaussian Discriminant Analysis](https://machinelearningmastery.com/linear-discriminant-analysis-with-python/)
* [Gaussian Discriminant Analysis on Wikipedia](https://en.wikipedia.org/wiki/Linear_discriminant_analysis)
* [Bernoulli Naive Bayes on Scikit-learn](https://scikit-learn.org/1.5/modules/generated/sklearn.naive_bayes.BernoulliNB.html)

---

## Do NOT write `for` loops over the number of samples N; loops are allowed ONLY over the number of classes C

In [56]:
import numpy as np
from sklearn.datasets import load_iris, load_digits, load_digits
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.preprocessing import Binarizer

def rel_error(x, y):
    """Return the largest element-wise relative error between x and y.

    The error is |x - y| / (|x| + |y|), with the denominator floored at
    1e-8 so that entries where both values are zero do not divide by zero.
    """
    numerator = np.abs(x - y)
    denominator = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(numerator / denominator)

# Gaussian Discriminant Analysis

In [57]:
# Load the iris dataset; X_train / y_train are reused by the Gaussian
# discriminant analysis and Gaussian naive Bayes cells further down.
data = load_iris()
X_train, y_train = data.data, data.target

In [58]:
def compute_priors(X, y):
    """
    Prior probability for each class (empirical class frequencies).

    Inputs:
    - X: array of shape (N, D); unused, kept for a uniform signature
    - y: array-like of shape (N,) with integer labels in [0, C)

    Returns:
    - priors : array of shape (C,), where C = max(y) + 1
    """
    # Accept plain lists too: `list == int` would silently compare to False.
    y = np.asarray(y)
    n_classes = np.max(y) + 1
    priors = np.zeros(n_classes)

    # Loop over classes only; the counting itself is vectorized over samples.
    for c in range(n_classes):
        priors[c] = np.sum(y == c) / len(y)

    return priors

In [59]:
# Fit scikit-learn's QDA as the reference implementation.
sk_model = QuadraticDiscriminantAnalysis()
sk_model.fit(X_train, y_train)

# Compare our class priors against the reference `priors_`.
priors = compute_priors(X_train, y_train)
error = rel_error(sk_model.priors_, priors)
print(error)
assert  error < 1e-12

0.0


In [60]:
def compute_means(X, y):
    """
    Per-class mean vectors, looping over the C classes only (never over samples).

    Inputs:
    - X: array of shape (N, D)
    - y: array of shape (N,) with integer labels in [0, C)

    Returns:
    - means : array of shape (C, D); row c is the mean of the rows of X labelled c
    """
    _, n_features = X.shape
    n_classes = np.max(y) + 1
    means = np.zeros((n_classes, n_features))

    for label in range(n_classes):
        # Boolean mask selects every sample of this class at once.
        members = X[y == label]
        means[label] = np.mean(members, axis=0)

    return means

In [61]:
# Fit scikit-learn's QDA as the reference implementation.
sk_model = QuadraticDiscriminantAnalysis()
sk_model.fit(X_train, y_train)

# Compare our per-class means against the reference `means_`.
means = compute_means(X_train, y_train)
error = rel_error(sk_model.means_, means)
print(error)
assert  error < 1e-12

0.0


In [62]:
import numpy as np

def compute_sigmas_gda(X, y, means):
    """
    Per-class covariance matrices, looping over the C classes only.
    Does not use np.cov.

    Inputs:
    - X: array of shape (N, D)
    - y: array of shape (N,) with integer labels in [0, C)
    - means: array of shape (C, D)

    Returns:
    - covariances : array of shape (C, D, D); each matrix is normalized by
      (N_c - 1), where N_c is the number of samples in class c
    """
    _, n_features = X.shape
    n_classes = np.max(y) + 1
    covariances = np.zeros((n_classes, n_features, n_features))

    for label in range(n_classes):
        members = X[y == label]              # (N_c, D)
        centered = members - means[label]    # deviations from the class mean
        scatter = np.dot(centered.T, centered)
        covariances[label] = scatter / (members.shape[0] - 1)

    return covariances


In [63]:
# Fit the reference QDA, keeping its per-class covariance matrices.
sk_model = QuadraticDiscriminantAnalysis(store_covariance=True)
sk_model.fit(X_train, y_train)

# Our covariances are computed from the reference means, so only the
# covariance computation itself is being checked here.
covariances = compute_sigmas_gda(X_train, y_train, sk_model.means_)
error = rel_error(np.asarray(sk_model.covariance_), covariances)
print(error)
assert  error < 1e-12

1.4271848449270932e-15


In [64]:
def compute_sigma_lda(X, y, means):
    """
    Shared within-class covariance estimate for LDA, with no loop over samples.
    Does not use np.cov.

    Inputs:
    - X: array of shape (N, D)
    - y: array of shape (N,) with integer labels indexing the rows of `means`
    - means: array of shape (C, D)

    Returns:
    - covariance : array of shape (D, D), the within-class scatter divided by N
    """
    N, D = X.shape

    # Look up each sample's own class mean by integer indexing. This yields an
    # (N, D) array directly, instead of building (N, C, D) deviation tensors
    # and masking them with a one-hot matrix.
    centered = X - np.asarray(means)[np.asarray(y)]

    # Sum of outer products of the deviations, normalized by the sample count.
    return np.dot(centered.T, centered) / N


In [65]:
# Fit the reference LDA, keeping its shared covariance matrix.
sk_model = LinearDiscriminantAnalysis(store_covariance=True)
sk_model.fit(X_train, y_train)

# Compare our shared covariance (built from the reference means) with `covariance_`.
covariances = compute_sigma_lda(X_train, y_train, sk_model.means_)
error = rel_error(np.asarray(sk_model.covariance_), covariances)
print(error)
assert  error < 1e-12

6.13626981244007e-16


In [66]:
def compute_log_posterior_lda(X, C, priors, means, covariance):
    """
    Linear discriminant scores for every observation and class.

    For class c the score is  x . W[c] + b[c]  with
    W = means @ inv(covariance).T  and
    b[c] = -0.5 * mu_c^T inv(covariance) mu_c + log(prior_c).

    Inputs:
    - X: array of shape (N, D)
    - C: number of classes
    - priors: array of shape (C,)
    - means: array of shape (C, D)
    - covariance: shared covariance matrix of shape (D, D)

    Returns:
    - array of shape (N, C) of scores
    """
    n_samples, n_features = X.shape  # also rejects input that is not 2-D

    precision = np.linalg.inv(covariance)        # (D, D)
    weights = np.dot(means, precision.T)         # (C, D)

    # One bias per class: quadratic form of the class mean plus the log prior.
    biases = np.zeros(C)
    for k in range(C):
        mu_k = means[k]
        biases[k] = -0.5 * np.dot(mu_k, np.dot(precision, mu_k)) + np.log(priors[k])

    return np.dot(X, weights.T) + biases


In [67]:
# No standalone test for the LDA log-posterior: it is covered by the combined LDA test below.

In [68]:
def compute_log_posterior_gda(X, C, priors, means, covariances):
    """
    Unnormalized log posterior of each class for each observation in GDA/QDA.

    For class c the score is
        -0.5 * (x - mu_c)^T inv(Sigma_c) (x - mu_c)
        - 0.5 * log|det(Sigma_c)| + log(prior_c)
    (the class-independent constant -D/2 * log(2*pi) is omitted).

    Inputs:
    - X: array of shape (N, D)
    - C: number of classes
    - priors: array of shape (C,)
    - means: array of shape (C, D)
    - covariances: sequence of C matrices of shape (D, D)

    Returns:
    - log_posterior : array of shape (N, C)
    """
    N, D = X.shape
    log_posterior = np.zeros((N, C))

    for c in range(C):
        covariance_c = np.asarray(covariances[c])  # (D, D)
        covariance_inv = np.linalg.inv(covariance_c)

        # slogdet returns log|det| directly. np.log(np.linalg.det(...))
        # underflows to log(0) = -inf when the determinant is tiny.
        _, log_det_covariance = np.linalg.slogdet(covariance_c)

        # Row-wise quadratic form (x - mu)^T Sigma^-1 (x - mu), no sample loop.
        diff = X - means[c]  # (N, D)
        quadratic_term = -0.5 * np.sum(diff @ covariance_inv * diff, axis=1)

        constant_term = -0.5 * log_det_covariance + np.log(priors[c])
        log_posterior[:, c] = quadratic_term + constant_term

    return log_posterior


In [69]:
# Fit the reference QDA, keeping its per-class covariance matrices.
sk_model = QuadraticDiscriminantAnalysis(store_covariance=True)
sk_model.fit(X_train, y_train)

# Number of classes, assuming labels are 0..C-1.
C = (np.max(y_train) + 1)
# Feed the reference parameters to our scoring function so that only the
# log-posterior computation is being compared.
log_posterior = compute_log_posterior_gda(X_train, C, sk_model.priors_, sk_model.means_, sk_model.covariance_)
# NOTE(review): `_decision_function` is underscore-prefixed, i.e. not part of
# scikit-learn's public API; confirm it exists in the installed version.
error = rel_error(np.asarray(sk_model._decision_function(X_train)), log_posterior)
print(error)
assert  error < 1e-12

1.5243557983786438e-14


In [70]:
class ProbClassifier():
    """Base class for classifiers that expose per-class log-posterior scores.

    Subclasses override `fit` and `compute_log_posterior`; `predict` and
    `predict_proba` are derived from the (N, C) score matrix.
    """

    def fit(self, X, y):
        """Placeholder; subclasses estimate their parameters here."""
        pass

    def compute_log_posterior(self, X):
        """Placeholder; subclasses return an (N, C) array of log scores."""
        pass

    def predict(self, X):
        """Return, for each row of X, the index of the highest-scoring class."""
        log_post = self.compute_log_posterior(X)
        return np.argmax(log_post, axis=1)

    def predict_proba(self, X):
        """Return class probabilities of shape (N, C); each row sums to 1."""
        log_post = self.compute_log_posterior(X)
        # Subtract the row maximum before exponentiating. The softmax value is
        # unchanged, but exp() can no longer overflow to inf or underflow every
        # entry to 0, both of which produced NaN rows.
        shifted = log_post - np.max(log_post, axis=1, keepdims=True)
        unnormalized = np.exp(shifted)
        return unnormalized / np.sum(unnormalized, axis=1, keepdims=True)


In [71]:
class LDA(ProbClassifier):
    """Linear discriminant analysis: Gaussian classes sharing one covariance."""

    def __init__(self):
        self.priors = None  # class priors, shape (C,)
        self.means = None   # class means, shape (C, D)
        self.cov = None     # shared covariance, shape (D, D)
        self.C = None       # number of classes

    def fit(self, X, y):
        """
        Estimate priors, class means and the shared covariance from (X, y).

        Inputs:
        - X: array of shape (N, D)
        - y: array of shape (N,) with integer labels in [0, C)
        """
        self.C = np.max(y) + 1
        self.priors = compute_priors(X, y)
        self.means = compute_means(X, y)
        self.cov = compute_sigma_lda(X, y, self.means)

    def compute_log_posterior(self, X):
        """
        Return the (N, C) matrix of unnormalized log posteriors:
        -0.5 * ((x - mu_c)^T inv(cov) (x - mu_c) + log|det(cov)|) + log(prior_c).
        """
        N, D = X.shape
        log_post = np.zeros((N, self.C))

        # The covariance is shared by all classes, so invert it and take its
        # log-determinant once instead of once per class.
        inv_cov = np.linalg.inv(self.cov)
        log_det_cov = np.linalg.slogdet(self.cov)[1]

        for i in range(self.C):
            mean_diff = X - self.means[i]
            # Row-wise quadratic form, vectorized over samples.
            quad_term = np.sum(mean_diff @ inv_cov * mean_diff, axis=1)
            log_post[:, i] = -0.5 * (quad_term + log_det_cov) + np.log(self.priors[i])

        return log_post

In [72]:
# Reference predictions from scikit-learn's LDA.
sk_model = LinearDiscriminantAnalysis(store_covariance=True)
sk_model.fit(X_train, y_train)
sk_pred = sk_model.predict(X_train)

# Predictions from our LDA, fitted on the same data.
lda = LDA()
lda.fit(X_train, y_train)
pred = lda.predict(X_train)

# Every predicted label must match the reference.
assert (sk_pred == pred).all()
print("Accuracy scikit-learn : ", accuracy_score(y_train, sk_pred))
print("Your Accuracy : ", accuracy_score(y_train, pred))

Accuracy scikit-learn :  0.98
Your Accuracy :  0.98


In [73]:
class QDA(ProbClassifier):
    """Quadratic discriminant analysis: one Gaussian with its own covariance per class."""

    def __init__(self):
        self.priors = None       # class priors, shape (C,)
        self.means = None        # class means, shape (C, D)
        self.covariances = None  # per-class covariances, shape (C, D, D)
        self.C = None            # number of classes

    def fit(self, X, y):
        """
        Estimate priors, class means and per-class covariances from (X, y).

        Inputs:
        - X: array of shape (N, D)
        - y: array of shape (N,) with integer labels in [0, C)
        """
        self.C = np.max(y) + 1
        self.priors = compute_priors(X, y)
        self.means = compute_means(X, y)
        # Use the notebook's own estimator rather than np.cov. It always
        # returns (C, D, D), whereas np.cov collapses to 0-d arrays when D == 1
        # and np.linalg.inv then fails.
        self.covariances = compute_sigmas_gda(X, y, self.means)

    def compute_log_posterior(self, X):
        """
        Return the (N, C) matrix of unnormalized log posteriors:
        -0.5 * ((x - mu_c)^T inv(Sigma_c) (x - mu_c) + log|det(Sigma_c)|) + log(prior_c).
        """
        N, D = X.shape
        log_post = np.zeros((N, self.C))

        for i in range(self.C):
            mean_i = self.means[i]
            cov_i = self.covariances[i]
            prior_i = self.priors[i]

            mean_diff = X - mean_i
            inv_cov = np.linalg.inv(cov_i)
            log_det_cov = np.linalg.slogdet(cov_i)[1]

            # Row-wise quadratic form, vectorized over samples.
            quad_term = np.sum(mean_diff @ inv_cov * mean_diff, axis=1)

            log_post[:, i] = -0.5 * (quad_term + log_det_cov) + np.log(prior_i)

        return log_post

In [74]:
# Reference predictions from scikit-learn's QDA.
sk_model = QuadraticDiscriminantAnalysis(store_covariance=True)
sk_model.fit(X_train, y_train)
sk_pred = sk_model.predict(X_train)

# Predictions from our QDA, fitted on the same data.
qda = QDA()
qda.fit(X_train, y_train)
pred = qda.predict(X_train)

# Every predicted label must match the reference.
assert (sk_pred == pred).all()
print("Accuracy scikit-learn : ", accuracy_score(y_train, sk_pred))
print("Your Accuracy : ", accuracy_score(y_train, pred))

Accuracy scikit-learn :  0.98
Your Accuracy :  0.98


In [75]:
# Reference class probabilities from scikit-learn's QDA.
# Note: in this cell sk_pred / pred hold probability matrices, not labels.
sk_model = QuadraticDiscriminantAnalysis(store_covariance=True)
sk_model.fit(X_train, y_train)
sk_pred = sk_model.predict_proba(X_train)

# Class probabilities from our QDA, fitted on the same data.
qda = QDA()
qda.fit(X_train, y_train)
pred = qda.predict_proba(X_train)

error = rel_error(pred, sk_pred)
print(error)
assert error < 1e-12

1.5128081023261472e-14


# Naive Bayes Classifiers

## Bernoulli Naive Bayes

In [76]:
# Load the digits dataset and pass its features through Binarizer; the
# transformed matrix is the input of the Bernoulli naive Bayes cells below.
data = load_digits()
X_train2, y_train2 = data.data, data.target
X_train2_transf = Binarizer().fit_transform(X_train2)

In [77]:
import numpy as np

class BernoulliNaiveBayes(ProbClassifier):
    """Naive Bayes classifier with independent Bernoulli features."""

    def __init__(self):
        self.priors = None  # class priors, shape (C,)
        self.C = None       # number of classes
        self.theta = None   # P(feature = 1 | class), shape (D, C)

    def fit(self, X, y):
        """
        Estimate class priors and per-feature Bernoulli parameters.
        Loops over the C classes only, never over the N samples.

        Inputs:
        - X: array of shape (N, D)
        - y: array of shape (N,) with integer labels in [0, C)
        """
        n_samples, n_features = X.shape
        self.C = np.max(y) + 1
        self.priors = np.zeros(self.C)
        self.theta = np.zeros((n_features, self.C))

        for label in range(self.C):
            in_class = y == label
            members = X[in_class]

            # Empirical class frequency.
            self.priors[label] = np.sum(in_class) / n_samples

            # Smoothed estimate: +1 on the count of ones, +2 on the class size,
            # which keeps theta strictly between 0 and 1.
            ones_count = np.sum(members, axis=0)
            self.theta[:, label] = (ones_count + 1) / (members.shape[0] + 2)

    def compute_log_posterior(self, X):
        """
        Return the (N, C) matrix of unnormalized log posteriors:
        log(prior_c) + sum_d [x_d log(theta_dc) + (1 - x_d) log(1 - theta_dc)].
        """
        n_samples, n_features = X.shape
        log_post = np.zeros((n_samples, self.C))

        for label in range(self.C):
            log_on = np.log(self.theta[:, label])       # log P(x_d = 1 | c)
            log_off = np.log(1 - self.theta[:, label])  # log P(x_d = 0 | c)
            log_post[:, label] = np.log(self.priors[label]) + np.dot(X, log_on) + np.dot(1 - X, log_off)

        return log_post
    


In [78]:
# Reference predictions from scikit-learn's BernoulliNB on the transformed digits.
sk_model = BernoulliNB()
sk_model.fit(X_train2_transf, y_train2)
sk_pred = sk_model.predict(X_train2_transf)

# Predictions from our Bernoulli naive Bayes on the same data.
model = BernoulliNaiveBayes()
model.fit(X_train2_transf, y_train2)
pred = model.predict(X_train2_transf)

sk_acc = accuracy_score(y_train2, sk_pred)
model_acc = accuracy_score(y_train2, pred)
print("Accuracy scikit-learn : ", sk_acc)
print("Your Accuracy : ", model_acc)
# One-sided check: passes when our accuracy is less than 0.01 below the reference.
assert sk_acc - model_acc < 0.01

Accuracy scikit-learn :  0.8636616583194212
Your Accuracy :  0.8636616583194212


## Gaussian Naive Bayes

In [92]:
class GaussianNaiveBayes(ProbClassifier):
    """Naive Bayes classifier with independent Gaussian features per class."""

    def __init__(self):
        self.priors = None  # class priors, shape (C,)
        self.C = None       # number of classes
        self.mu = None      # per-class feature means, shape (C, D)
        self.sigma = None   # per-class feature variances, shape (D, C)

    def fit(self, X, y):
        """
        Estimate class priors and per-class feature means and variances.
        Loops over the C classes only, never over the N samples.

        Inputs:
        - X: array of shape (N, D)
        - y: array of shape (N,) with integer labels in [0, C)
        """
        N, D = X.shape
        self.C = np.max(y) + 1
        self.priors = compute_priors(X, y)
        self.mu = compute_means(X, y)  # shape (C, D): one row per class
        self.sigma = np.zeros((D, self.C))

        for c in range(self.C):
            X_class = X[y == c]
            # Variance of each feature within the class (np.var default normalization).
            self.sigma[:, c] = np.var(X_class, axis=0)

    def compute_log_posterior(self, X):
        """
        Return the (N, C) matrix of unnormalized log posteriors:
        log(prior_c) - 0.5 * sum_d [log(2*pi*var_dc) + (x_d - mu_cd)^2 / var_dc].
        """
        N, D = X.shape
        log_post = np.zeros((N, self.C))

        for c in range(self.C):
            # mu is stored as (C, D), so the class mean is ROW c. Indexing the
            # column (mu[:, c]) gives a length-C vector that does not line up
            # with the D features of X.
            mean_c = self.mu[c]          # (D,)
            var_c = self.sigma[:, c]     # (D,)

            log_norm = -0.5 * np.sum(np.log(2 * np.pi * var_c))
            log_exp = -0.5 * np.sum(((X - mean_c) ** 2) / var_c, axis=1)
            log_post[:, c] = np.log(self.priors[c]) + log_norm + log_exp

        return log_post



In [93]:
# Reference predictions from scikit-learn's GaussianNB on the iris data.
sk_model = GaussianNB()
sk_model.fit(X_train, y_train)
sk_pred = sk_model.predict(X_train)

model = GaussianNaiveBayes()
model.fit(X_train, y_train)
# Predict with the model fitted just above. Without this assignment `pred`
# still holds whatever an earlier cell stored, and that is what gets scored.
pred = model.predict(X_train)

sk_acc = accuracy_score(y_train, sk_pred)
model_acc = accuracy_score(y_train, pred)
print("Accuracy scikit-learn : ", sk_acc)
print("Your Accuracy : ", model_acc)
# One-sided check: passes when our accuracy is less than 0.01 below the reference.
assert sk_acc - model_acc < 0.01

Accuracy scikit-learn :  0.96
Your Accuracy :  0.96
