In [131]:
import pandas as pd
import numpy as np

Bayes theorem states that 
$$
P(A | B) = \frac{P(A \cap B)}{P(B)} = \frac{P(B|A) P(A)}{P(B)}\quad \Longrightarrow \quad \text{posterior} = \frac{\text{Class likelihood} \times \text{prior}}{\text{Evidence}}
$$
* $P(A \cap B)$: the probability of $A$ and $B$
* $P(A|B)$: the probability of $A$ given $B$
* $P(B|A)$: the probability of $B$ given $A$
* $P(A)$: the probability of $A$ occurring
* $P(B)$: the probability of $B$ occurring

And, when two events are independent, 
$$
P(A \cap B) = P(A) \cdot P(B)
$$

For class variable $y$ and dependent feature vector $X$, we can apply Bayes theorem: 
$$
P(y|X) = \frac{P(X|y)P(y)}{P(X)}, \text{where } X = (x_1, x_2, x_3,...,x_n)
$$

The Naive Bayes approximation assumes that different feature dimensions (elements of $X$) are conditionally independent given the class $y$. Applying this to our posterior probability: 
$$
P(y|x_1,...,x_n) = \frac{P(x_1|y)P(x_2|y)...P(x_n|y)P(y)}{P(x_1,x_2,...,x_n)} 
$$
$$
P(y|x_1,...,x_n) \propto P(y) \prod_{j=1}^n P(x_j |y)
$$
For class label k, 
$$
P(y=k|x) \propto P(y=k) \prod_{j=1}^n P(x_j | y = k)
$$

We must now calculate model parameters ($\theta$'s) for each class probability $P(y = k)$ and each class-conditional probability $P(x_j = v | y = k)$. Beginning with the simpler case, 
$$
\theta_k = P(y = k) = \frac{N_k + \alpha}{n+ \alpha \times K}
$$
* $N_k$: number of instances with label $k$
* $n$: number of training instances
* $K$: number of unique classes
* $\alpha$: Laplace smoothing parameter

To calculate class-conditional probabilities, 
$$
\theta_{k,j,v} = P(x_j = v_j | y = k) = \frac{N_{k,v_j} + \alpha}{N_k + \alpha \times V_j}
$$
* $N_{k,v_j}$: the number of times the value $v_j$ occurs in feature $x_j$ in training instances where the target class is $k$
* $N_k$: the number of training instances where the target class is $k$ (the total count of values observed for feature $x_j$ in class $k$)
* $V_{j}$: the number of distinct values that feature $x_j$ can take
* $\alpha$: Laplace smoothing parameter
  
By setting $\alpha = 1$, we will apply Laplace smoothing to handle zero-frequency problems (when a feature value has not been observed in a class). 

Lastly, to make an inference/prediction about a new instance, we can define a Naive Bayes classifier by modifying our probability calculation for log space:
$$
P(y=k|x) \propto P(y=k) \prod_{j=1}^n P(x_j | y = k) 
$$
$$
\hat{y} = \underset{k \isin \{1,2,...,K \}  }{\text{argmax}} P(y=k) \prod_{j=1}^n P(x_j | y = k) 
$$
$$\boxed{\boxed{
\hat{y} = \underset{k \isin \{1,2,...,K \}  }{\text{argmax}} \log P(y=k) + \sum_{j=1}^n \log P(x_j | y = k) }}
$$
If we instead had a dataset of continuous (not discrete) variables, we can apply a similar classification method: Gaussian Naive Bayes. To do this, we must first assume that each feature is normally (Gaussian) distributed within each class. For each class, we can then calculate the mean and variance of each feature. That is, for a dataset with features $x_1, x_2,...,x_n$ and classes $k_1, k_2,...,k_m$, for each feature $x_i$ in class $k_j$, we calculate mean $\mu_{ij}$ and variance $\sigma_{ij}^2$. For example, the mean is the sum of all values of feature $x_i$ for the instances in class $k_j$ divided by the number of instances in class $k_j$. These are calculated as:
$$
\mu_{ij} = \frac{\sum_{x \isin k_j} x_i }{N_{k_j}} \quad \text{and} \quad \sigma_{ij}^2 = \frac{\sum_{x \isin k_j} \left( x_i - \mu_{ij} \right)^2 }{N_{k_j}} 
$$
We can then use the Gaussian probability density function to calculate the probability of observing the specific value $x$ for feature $x_i$ given that it belongs to class $k_j$. 
$$
P(x_i = x | k_j) = \frac{1}{\sqrt{2 \pi \sigma_{ij}^2}} \exp \left(- \frac{\left( x - \mu_{ij} \right)^2}{2\sigma_{ij}^2}\right)
$$

In [142]:
# --- Load data and configure the model ----------------------------------
census_data = pd.read_csv("./datasets/1994_census_cleaned_train.csv")

target_class = "sex"  # column whose value the classifier predicts
alpha = 1             # Laplace (add-alpha) smoothing parameter

column_headers = list(census_data.columns.values)
# Every column except the target is treated as a categorical feature.
features = [col for col in census_data.columns if col != target_class]

# --- Class priors: theta_k = (N_k + alpha) / (n + alpha * K) -------------
# Iterate over the column's values directly instead of looking each row up
# by label (census_data[col][i]); the label lookup is slow and only works
# when the frame has the default 0..n-1 index. Keys keep the order in which
# the classes first appear in the data.
class_frequencies = {}
for label in census_data[target_class]:
    class_frequencies[label] = class_frequencies.get(label, 0) + 1
training_instances = len(census_data)

n_classes = len(class_frequencies)
log_priors = {
    label: np.log((count + alpha) / (training_instances + alpha * n_classes))
    for label, count in class_frequencies.items()
}

# --- Raw per-class value counts ------------------------------------------
# NOTE(review): these counters are keyed by value only, so equal values that
# occur in different columns share one counter. The table is not read by any
# code visible here (class_conditional_probs below is what classify_NB
# uses); its original shape is preserved in case later cells depend on it.
feature_frequencies = {label: {} for label in class_frequencies}
feature_columns = [census_data[col] for col in features]
for label, *row_values in zip(census_data[target_class], *feature_columns):
    per_class_counts = feature_frequencies[label]
    for value in row_values:
        per_class_counts[value] = per_class_counts.get(value, 0) + 1

# --- Size of each column's value space (V_j) -----------------------------
# Series.unique() returns the distinct values in order of first appearance,
# which replaces the quadratic "value not in list" scan. The same arrays are
# reused below so V_j always matches the set of values that get a parameter.
distinct_values = {col: census_data[col].unique() for col in column_headers}
feature_space_size = {col: len(values) for col, values in distinct_values.items()}

# --- Class-conditional log-probabilities ---------------------------------
# theta_{k,j,v} = (N_{k,v_j} + alpha) / (N_k + alpha * V_j), stored as logs in
# class_conditional_probs[class][feature][value].
class_conditional_probs = {
    label: {feature: {} for feature in features} for label in class_frequencies
}
for label in class_frequencies:
    # The class mask does not depend on the feature, so build it once per class.
    in_class = census_data[target_class] == label
    for feature in features:
        # One value_counts() per (class, feature) replaces a full-frame filter
        # per (class, feature, value). value_counts() skips missing values by
        # default, so they contribute to neither the counts nor the total.
        counts_in_class = census_data.loc[in_class, feature].value_counts().to_dict()
        total_count = sum(counts_in_class.values())
        denominator = total_count + alpha * feature_space_size[feature]
        log_thetas = class_conditional_probs[label][feature]
        for feature_value in distinct_values[feature]:
            # Values never seen with this class get the smoothed count alpha.
            count = counts_in_class.get(feature_value, 0)
            log_thetas[feature_value] = np.log((count + alpha) / denominator)

def classify_NB(new_instance, log_priors, class_conditional_probs, features):
    """Return the most probable class for ``new_instance`` under Naive Bayes.

    Each class ``k`` is scored in log space as
    ``log P(y=k) + sum_j log P(x_j | y=k)`` and the best-scoring class wins.

    Args:
        new_instance: Mapping of feature name -> observed value. Keys that
            are not listed in ``features`` are never read.
        log_priors: Mapping of class label -> log prior probability.
        class_conditional_probs: Nested mapping
            ``{class label: {feature: {value: log probability}}}``.
        features: Sequence of feature names to include in the score (it is
            iterated once per class).

    Returns:
        The class label with the highest log score. On a tie, the label that
        comes first in ``log_priors`` is returned.

    Raises:
        ValueError: If ``log_priors`` is empty (raised by ``max``).
    """
    class_probabilities = {}

    for class_value, log_prior in log_priors.items():
        log_prob = log_prior
        for feature in features:
            feature_value = new_instance.get(feature)
            # Test for "missing" explicitly: a truthiness check would also
            # discard legitimate falsy values such as 0, 0.0, False or "".
            if feature_value is None:
                continue
            value_log_probs = class_conditional_probs[class_value][feature]
            # Values that have no entry for this feature are skipped and
            # therefore leave the score unchanged.
            if feature_value in value_log_probs:
                log_prob += value_log_probs[feature_value]
        class_probabilities[class_value] = log_prob

    return max(class_probabilities, key=class_probabilities.get)

In [143]:
# Example record to classify. classify_NB only reads the keys listed in
# `features`, so the 'sex' entry (the target column) is carried along but
# does not influence the prediction.
new_instance = dict(
    age='Senior',
    workclass='Private',
    education='HS-grad',
    marital_status='Widowed',
    occupation='Exec-managerial',
    relationship='Not-in-family',
    race='White',
    sex='Female',
    hours_per_week='Part-time',
    income='<=50K',
)

# Last expression of the cell, so the predicted label is displayed as output.
classify_NB(
    new_instance=new_instance,
    log_priors=log_priors,
    class_conditional_probs=class_conditional_probs,
    features=features,
)

'Female'