In [9]:
import numpy as np
import matplotlib.pyplot as plt

# Bernoulli naive Bayes

Run the below cell to get the following variables:

`X` = Data matrix of shape $(n, d)$. All the features are binary taking values $0$ or $1$.

`y` = label vector. Labels are $0$ and $1$.

In [2]:
# Seeded generator so the synthetic dataset is reproducible.
rng = np.random.default_rng(seed=1)
# Each feature column holds 100 Bernoulli draws (binomial with n = 1):
# the first 50 use one success probability and the last 50 another,
# lining up with the label-0 / label-1 halves of `y` built below.
# The order of these rng calls determines the data; do not reorder them.
X1 = np.concatenate((rng.binomial(size = 50,n = 1, p =0.7), rng.binomial(size = 50,n = 1, p =0.2))).reshape(-1, 1)
X2 = np.concatenate((rng.binomial(size = 50,n = 1, p =0.6), rng.binomial(size = 50,n = 1, p =0.1))).reshape(-1, 1)
X3 = np.concatenate((rng.binomial(size = 50,n = 1, p =0.6), rng.binomial(size = 50,n = 1, p =0.2))).reshape(-1, 1)
X4 = np.concatenate((rng.binomial(size = 50,n = 1, p =0.8), rng.binomial(size = 50,n = 1, p =0.1))).reshape(-1, 1)


# Data matrix of shape (100, 4): one column per feature.
X = np.column_stack((X1,X2,X3,X4))

# Labels: 50 zeros followed by 50 ones, stored as a column vector of shape (100, 1).
y = np.concatenate((np.zeros(50, dtype= int), np.ones(50, dtype = int))).reshape(-1, 1)
# One random permutation of the row indices 0..99 ...
permute = rng.permuted(range(100))

# ... applied to both X and y so that features and labels stay aligned after shuffling.
X = X[permute]
y = y[permute]


## Question 1
If we train the naive Bayes model on the dataset, what will be the value of $\hat{p}$, the estimate for $P(Y=1)$?



In [4]:
# The labels are 0/1, so their mean is the fraction of examples with y = 1.
p = np.mean(y)
p

0.5

## Question 2
What will be the value of $\hat{p}_0^0$, the estimate of $P(f_0=1|y=0)$?  Write your answer correct to two decimal places.



In [37]:
def p_Xi_yj(X, y, i,j):
  """Estimate P(f_i = 1 | y = j) as a relative frequency.

  Parameters
  ----------
  X : binary data matrix, one row per example.
  y : label array; may be flat or a column vector.
  i : index of the feature column.
  j : class label to condition on.

  Returns
  -------
  The number of examples with label ``j`` whose feature ``i`` equals 1,
  divided by the number of examples with label ``j``.
  """
  in_class = (y == j)
  feature_on = (X[:, i] == 1)
  # Flatten the class mask so it lines up with the 1-D feature column.
  hits = np.count_nonzero(feature_on & in_class.reshape(-1))
  class_size = np.count_nonzero(in_class)
  return hits / class_size

In [38]:
# Fraction of label-0 examples whose feature 0 equals 1.
p_Xi_yj(X, y, 0,0)

0.68

## Question 3
What will be the value of $\hat{p}_0^1$, the estimate of $P(f_0=1|y=1)$?  Write your answer correct to two decimal places.



In [36]:
# Fraction of label-1 examples whose feature 0 equals 1.
p_Xi_yj(X, y, 0,1)

0.26

## Question 4
What will be the value of $\hat{p}_3^1$, the estimate of $P(f_3=1|y=1)$?  Write your answer correct to two decimal places.




In [39]:
# Fraction of label-1 examples whose feature 3 equals 1.
p_Xi_yj(X, y, 3,1)

0.12

## Question 5

What will be the predicted label for the point $[1, 0, 1, 0]$?



In [42]:
def predict(x_test, X, y):
  """Predict the label (0 or 1) of one binary point with Bernoulli naive Bayes.

  The score of class k is the class prior times the product of the
  per-feature Bernoulli likelihoods:

      P(Y = k) * prod_i  P(f_i = x_i | y = k)

  The class prior is part of the score, so the rule stays correct when the
  classes are imbalanced. With perfectly balanced classes the prior scales
  both scores by the same factor and the decision is unaffected.

  Parameters
  ----------
  x_test : 1-D array of 0/1 feature values for the point to classify.
  X : binary training data matrix, one row per example.
  y : training labels (0/1); flat or column vector.

  Returns
  -------
  1 if the class-1 score is strictly larger than the class-0 score,
  otherwise 0 (ties go to class 0).
  """
  # Labels are 0/1, so their mean is the estimate of P(Y = 1).
  prior_1 = np.mean(y)
  p_0, p_1 = 1 - prior_1, prior_1
  for i in range(x_test.shape[0]):
    # Estimate each conditional once per feature and reuse it for both cases.
    theta_0 = p_Xi_yj(X, y, i, 0)
    theta_1 = p_Xi_yj(X, y, i, 1)
    if x_test[i] == 1:
      p_0 *= theta_0
      p_1 *= theta_1
    else:
      p_0 *= 1 - theta_0
      p_1 *= 1 - theta_1
  return 1 if p_1 > p_0 else 0

In [43]:
# Classify the point [1, 0, 1, 0] using probabilities estimated from (X, y).
predict(np.array([1,0,1,0]), X, y)

1

## Question 6

What will be the predicted label for the point $[1, 0, 1, 1]$?



In [44]:
# Classify the point [1, 0, 1, 1] using probabilities estimated from (X, y).
predict(np.array([1,0,1,1]), X, y)

0

# Gaussian naive Bayes

Run the below cell to get the following variables:

`X_train` = Training dataset of the shape $(n, d)$. All the examples are coming from multivariate gaussian distribution.

`y_train` = label vector for corresponding training examples. labels are $0$ and $1$.

`X_test` = Test dataset of the shape $(m, d)$, where $m$ is the number of examples in the test dataset. All the examples are coming from multivariate gaussian distribution.

`y_test` = label vector for corresponding test examples. labels are $0$ and $1$.



In [3]:
from sklearn.datasets import make_classification, make_blobs
from sklearn.model_selection import train_test_split

# Generate artificial data points: 100 two-dimensional samples around the
# centers (5, 5) and (10, 10), each cluster with standard deviation 1.5.
# NOTE: this rebinds `X` and `y`, replacing the Bernoulli dataset defined
# earlier in the notebook; re-run the Bernoulli data cell before re-running
# any of the Bernoulli answers.
X, y = make_blobs(n_samples = 100,
                  n_features=2,
                  centers=[[5,5],[10,10]],
                  cluster_std=1.5,
                  random_state=2)

# Hold out 20% of the samples as a test split; the fixed random_state keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=123)

## Question 7

How many examples are there in the training dataset?



In [51]:
# Number of rows of X_train, i.e. the number of training examples.
n = X_train.shape[0]
n

80

## Question 8
How many features are there in the dataset?



In [52]:
# Number of columns of X_train, i.e. the number of features.
d = X_train.shape[1]
d

2

## Question 9

If we train the Gaussian naive Bayes model on the training dataset, what will be the value of $\hat{p}$, the estimate for $P(Y=1)$? Write your answer correct to two decimal places.





In [58]:
# The labels are 0/1, so the mean of y_train is the fraction of training examples with label 1.
p = np.mean(y_train)
p

0.4875

## Question 10

If $\hat{\mu}_0 = [\mu_1, \mu_2, ..., \mu_d]$ is the estimate for $\mu_0$, the mean of the examples labeled $0$, what will be the value of $\mu_1+\mu_2+...+\mu_d$? Write your answer correct to two decimal places.



In [10]:
def mu(X, y, label):
  """Return the per-feature mean of the examples carrying ``label``.

  Parameters
  ----------
  X : data matrix, one row per example.
  y : label array used to build the boolean row mask ``y == label``.
  label : class whose examples are averaged.

  Returns
  -------
  1-D array with one mean per feature column.
  """
  members = X[y == label]
  return members.mean(axis=0)

In [65]:
# Per-feature mean of the label-0 training examples, then the sum of its components.
mu_0 = mu(X_train,y_train,0)
np.sum(mu_0)

9.575936394688135

We will use a separate covariance matrix for each class. The estimate for $\Sigma_k$ will be the diagonal matrix

$$\hat{\Sigma}_k = \operatorname{diag}(\sigma_1^2, \sigma_2^2, \ldots, \sigma_d^2)$$ where $\sigma_i^2$ is the variance of the $i^{th}$ feature over the examples labeled $k$.



## Question 11
What will be the value of $\text{trace}({\hat{\Sigma}}_0)$?  Write your answer correct to two decimal places.







In [53]:
def sigma_i(X, y, label):
  """Return the diagonal covariance estimate for the examples with ``label``.

  Parameters
  ----------
  X : data matrix, one row per example.
  y : label array used to build the boolean row mask ``y == label``.
  label : class whose examples are used.

  Returns
  -------
  Square matrix whose diagonal holds the per-feature variances
  (``np.var`` default, i.e. divided by the number of examples) and whose
  off-diagonal entries are zero.
  """
  class_rows = X[y == label]
  per_feature_var = class_rows.var(axis=0)
  return np.diag(per_feature_var)


In [54]:
# Trace of the diagonal covariance estimate = sum of the per-feature variances of the label-0 training examples.
np.trace(sigma_i(X_train, y_train, 0))

4.435204194501573

## Question 12

Once we have estimated all the parameters for Gaussian naive Bayes, assuming a different covariance matrix for each class, we predict the labels for the training examples. What will be the training accuracy?

Accuracy is defined as the proportion of correctly classified examples.  Write your answer correct to two decimal places.




In [34]:
def predict(X, y, X_fit=None, y_fit=None):
  """Accuracy of Gaussian naive Bayes (per-class diagonal covariance) on (X, y).

  NOTE: this definition reuses the name ``predict`` and therefore replaces
  the Bernoulli ``predict(x_test, X, y)`` defined earlier in the notebook.

  All model parameters (class prior, class means, per-feature class
  variances) are estimated from ``(X_fit, y_fit)`` only. Each row of ``X``
  is then assigned the class k with the larger log-posterior score

      log P(Y = k) - 0.5 * sum_i [ log(2*pi*var_ki) + (x_i - mu_ki)^2 / var_ki ]

  which is the Gaussian log-density with covariance diag(var_k) plus the
  log prior.

  Parameters
  ----------
  X : data matrix to classify, one row per example.
  y : true 0/1 labels for the rows of ``X``.
  X_fit, y_fit : optional data used to estimate the parameters. When both
      are omitted, the notebook-level ``X_train`` / ``y_train`` are used, so
      ``predict(X_test, y_test)`` scores a model fitted on the training
      split rather than on the test labels.

  Returns
  -------
  Proportion of rows of ``X`` whose predicted label equals ``y``
  (ties between the two scores go to class 0).

  Raises
  ------
  ValueError if only one of ``X_fit`` / ``y_fit`` is supplied.
  """
  if (X_fit is None) != (y_fit is None):
    raise ValueError("X_fit and y_fit must be given together")
  if X_fit is None:
    X_fit, y_fit = X_train, y_train

  X = np.asarray(X, dtype=float)
  X_fit = np.asarray(X_fit, dtype=float)
  # Flatten labels so boolean row masks and the final comparison are 1-D.
  y = np.asarray(y).ravel()
  y_fit = np.asarray(y_fit).ravel()

  p = np.mean(y_fit)  # estimate of P(Y = 1)
  priors = (1 - p, p)

  scores = []
  for label in (0, 1):
    mean = mu(X_fit, y_fit, label)
    # sigma_i returns diag(var); only the variance vector is needed here.
    var = np.diag(sigma_i(X_fit, y_fit, label))
    log_likelihood = -0.5 * np.sum(
        np.log(2 * np.pi * var) + (X - mean) ** 2 / var, axis=1)
    scores.append(np.log(priors[label]) + log_likelihood)

  predictions = (scores[1] > scores[0]).astype(int)
  return np.mean(predictions == y)

In [35]:
# Accuracy reported by predict on the training split.
predict(X_train,y_train)

0.9875

## Question 13

What will be the test accuracy?

Accuracy is defined as the proportion of correctly classified examples.  




In [36]:
# Accuracy reported by predict on the held-out test split.
predict(X_test,y_test)

1.0