## 3 Logistic Regression versus Bayes Classifier
### Student ID: 35224436 | Full name: Yiming Zhang

## Task I. The Bayesian Classifier

### 1. Data preparation
Load the data and output some descriptive information about the data

In [None]:
from sklearn.datasets import load_breast_cancer

# Fetch the breast-cancer dataset as pandas objects (features + target)
data = load_breast_cancer(as_frame=True)
X, y = data.data, data.target

# Report the dimensions of the feature table and of the label vector
print(X.shape, y.shape, sep="\n")

# Preview the first five samples
X.head(5)

Split the data into training and test sets (80% training, 20% test, i.e. `test_size=0.2`).

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
# Show the shapes of the four resulting pieces
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

### 2. Bayesian Classifier 


In [None]:
from scipy.stats import multivariate_normal
import numpy as np


class BayesianClassifier:
    """Bayes classifier with Gaussian class-conditional densities.

    Each class ``c`` is modelled as ``p(x | c) = N(x; mean_c, cov_c)`` and
    predictions follow Bayes' rule with the empirical class frequencies as
    priors.

    Parameters
    ----------
    shared_cov : bool, default=True
        If True, every class uses the same covariance matrix, computed as the
        prior-weighted average of the per-class covariance matrices.
    cond_ind : bool, default=True
        If True, features are assumed conditionally independent given the
        class, i.e. the covariance matrices are diagonal.

    Attributes (set by ``fit``)
    ---------------------------
    classes_ : sorted unique class labels, length ``k_``
    n_, p_, k_ : number of samples, features and classes
    class_priors_ : shape ``(k_,)``, empirical class frequencies
    cond_means_ : shape ``(k_, p_)``, per-class feature means
    cond_covs_ : shape ``(k_, p_, p_)``, per-class covariance matrices
    """

    def __init__(self, shared_cov=True, cond_ind=True):
        self.shared_cov = shared_cov  # whether to share the covariance matrix
        self.cond_ind = cond_ind  # whether to assume conditional independence

    def fit(self, x, y):
        """Estimate priors, class means and class covariances.

        ``x`` is a 2-D array-like of shape ``(n, p)`` and ``y`` a 1-D
        array-like of ``n`` labels (arbitrary label values are supported).
        Both are converted to NumPy arrays so that DataFrame/Series input and
        ndarray input give the same (maximum-likelihood, ``ddof=0``)
        estimates. Returns ``self``.

        Raises
        ------
        ValueError
            If ``x`` is not 2-D or ``x`` and ``y`` differ in length.
        """
        x = np.asarray(x, dtype=float)
        y = np.asarray(y)
        if x.ndim != 2:
            raise ValueError("x must be a 2-D array of shape (n_samples, n_features)")
        if y.shape[0] != x.shape[0]:
            raise ValueError("x and y must contain the same number of samples")

        # get the classes and their counts
        self.classes_, class_counts = np.unique(y, return_counts=True)
        self.n_, self.p_ = x.shape  # get sample number and feature number
        self.k_ = len(self.classes_)

        # initialize the conditional means and covariance matrices
        self.cond_means_ = np.zeros(shape=(self.k_, self.p_))
        self.cond_covs_ = np.zeros(shape=(self.k_, self.p_, self.p_))

        # calculate the prior probabilities
        self.class_priors_ = class_counts / len(y)

        # calculate the conditional means and covariance matrices for each class
        for c, label in enumerate(self.classes_):
            # select rows by the actual label, not by its position in classes_
            x_c = x[y == label]

            self.cond_means_[c] = x_c.mean(axis=0)

            if self.cond_ind:
                # conditional independence -> diagonal matrix of variances
                np.fill_diagonal(self.cond_covs_[c], x_c.var(axis=0))
            else:
                # bias=True matches the ddof=0 variances used above
                self.cond_covs_[c] = np.cov(x_c, rowvar=False, bias=True)

        if self.shared_cov:
            # shared covariance: average of the class covariances weighted by
            # the class priors, broadcast back to every class slot
            shared_cov = np.moveaxis(self.cond_covs_, 0, -1).dot(self.class_priors_)
            self.cond_covs_[:] = shared_cov

        return self

    def predict_proba(self, x):
        """Return posterior probabilities ``p(c | x)``, shape ``(m, k_)``.

        Column ``j`` corresponds to ``classes_[j]``. If the marginal density
        ``p(x)`` of a sample underflows to 0, its row is all zeros.
        """
        x = np.asarray(x, dtype=float)
        m, _ = x.shape
        cond_probs = np.zeros(shape=(m, self.k_))
        for c in range(self.k_):
            # p(x | c); singular covariance matrices could happen
            # (e.g., through inaccurate estimation), hence allow_singular
            cond_probs[:, c] = multivariate_normal.pdf(
                x, self.cond_means_[c], self.cond_covs_[c], allow_singular=True
            )

        # joint p(x, c) = p(x | c) p(c) and marginal p(x) = sum_c p(x, c)
        joint = cond_probs * self.class_priors_
        marginal = joint.sum(axis=1, keepdims=True)

        # p(c | x) = p(x, c) / p(x); rows with p(x) == 0 are left at zero
        return np.divide(
            joint,
            marginal,
            out=np.zeros_like(joint),
            where=marginal > 0,
        )

    def predict(self, x):
        """Return the most probable class label for each row of ``x``."""
        return self.classes_[np.argmax(self.predict_proba(x), axis=1)]

    def decision_function(self, x):
        """Return log-odds scores derived from the posterior probabilities.

        For two classes the result has shape ``(m,)`` and holds
        ``log(p(classes_[1] | x) / p(classes_[0] | x))``. Otherwise it has
        shape ``(m, k_)`` and column ``j`` holds the one-vs-rest log-odds
        ``log(p_j / (1 - p_j))``. Posteriors of exactly 0 or 1 yield
        infinite (or NaN) scores.
        """
        probs = self.predict_proba(x)
        if self.k_ == 2:
            return np.log(probs[:, 1] / probs[:, 0])
        # one-vs-rest log-odds for every class at once
        return np.log(probs / (1 - probs))

    def generate(self, n, c, random_state=None):
        """Draw ``n`` samples from the fitted Gaussian of class index ``c``.

        ``c`` is the position of the class in ``classes_`` (0 .. k_-1).
        """
        return multivariate_normal.rvs(
            self.cond_means_[c], self.cond_covs_[c], size=n, random_state=random_state
        )

### 3. Variants of Bayesian Classifiers
In this section, we define three types of Bayesian classifiers: the Naive Bayes variant (without shared covariance), as well as the variants with full covariance (both shared and not shared).

In [None]:
# Naive Bayes: diagonal covariance, estimated separately for each class
naive_bayes = BayesianClassifier(cond_ind=True, shared_cov=False)

# Full covariance, one matrix shared by all classes
full_cov_shared = BayesianClassifier(cond_ind=False, shared_cov=True)

# Full covariance, a separate matrix for each class
full_cov_not_shared = BayesianClassifier(cond_ind=False, shared_cov=False)

### 4. Training Process
Use the training set obtained in the first step to train the three variants of Bayesian classifiers and the logistic regression model.

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit the three Bayesian classifier variants on the training split.
# Naive Bayes variant
naive_bayes.fit(X_train, y_train)
# full covariance variant with shared covariance
full_cov_shared.fit(X_train, y_train)
# full covariance variant with not shared covariance
full_cov_not_shared.fit(X_train, y_train)

# Logistic regression model.
# The features are not standardised here, and the default limit of 100
# solver iterations is typically too low for the optimiser to converge on
# such data (scikit-learn then emits a ConvergenceWarning), so the iteration
# budget is raised.
logistic_regression = LogisticRegression(max_iter=10000)
logistic_regression.fit(X_train, y_train)

### 5. Evaluation Methods
