In [24]:
import pandas as pd

class NaiveBayesClassifier:
    """
    A Naive Bayes classifier for categorical features with additive (Laplace) smoothing.

    The model is trained from a pandas DataFrame in which one column holds the
    class label and every other column is treated as a categorical feature.

    The fitted state is kept in plain dictionaries so it can be inspected directly:
        priors: {class_label: P(class)}
        conditionals: {class_label: {feature: {value: P(value | class)}}}
        total_instances_by_class: {class_label: number of training rows}
        unique_feature_values_count: {feature: number of distinct training values}
    """

    def __init__(self, alpha=1.0):
        """
        Initializes the classifier with empty probability dictionaries and a smoothing parameter.

        Args:
            alpha (float): The smoothing parameter (default is 1.0 for Laplace smoothing).
        """
        self.alpha = alpha
        self.priors = {}
        self.conditionals = {}
        self.target_column = None
        self.features = []
        self.total_instances_by_class = {}
        self.unique_feature_values_count = {}

    def _calculate_priors(self, df: pd.DataFrame):
        """
        Calculates the prior probability of each class as its relative frequency in ``df``.

        The dictionary is rebuilt from scratch (not updated in place) so that
        labels learned by an earlier ``fit`` call cannot survive a refit.
        """
        N = len(df)
        class_counts = df[self.target_column].value_counts()
        self.priors = {label: count / N for label, count in class_counts.items()}

    def _calculate_conditionals(self, df: pd.DataFrame):
        """
        Calculates the smoothed conditional probability of every feature value given each class.

        For a value seen ``count`` times among the ``n_c`` rows of a class, and a
        feature with ``k`` distinct training values, the stored probability is
        ``(count + alpha) / (n_c + alpha * k)``. Values that occur in the training
        data but never together with the class get ``alpha / (n_c + alpha * k)``.

        All fitted attributes written here are replaced, never merged with
        whatever a previous ``fit`` left behind.
        """
        self.features = [col for col in df.columns if col != self.target_column]

        # Store essential data for prediction
        self.total_instances_by_class = df[self.target_column].value_counts().to_dict()
        self.unique_feature_values_count = {
            feature: df[feature].nunique() for feature in self.features
        }

        # The distinct values of a feature do not depend on the class, so they
        # are collected once instead of once per class.
        observed_values = {feature: df[feature].unique() for feature in self.features}

        conditionals = {}
        # Iterate over the same class set the priors use (value_counts drops
        # missing labels), so every class here has at least one training row.
        for class_label, total_class_instances in self.total_instances_by_class.items():
            df_class = df[df[self.target_column] == class_label]
            class_table = {}

            for feature in self.features:
                num_unique_values = self.unique_feature_values_count[feature]
                denominator = total_class_instances + self.alpha * num_unique_values

                # Smoothed probability for values seen together with this class
                value_probs = {
                    value: (count + self.alpha) / denominator
                    for value, count in df_class[feature].value_counts().items()
                }

                # Probability for values seen in training, but not within this class
                unseen_prob = self.alpha / denominator
                for unique_value in observed_values[feature]:
                    if unique_value not in value_probs:
                        value_probs[unique_value] = unseen_prob

                class_table[feature] = value_probs

            conditionals[class_label] = class_table

        self.conditionals = conditionals

    def fit(self, df: pd.DataFrame, target_column: str):
        """
        Trains the Naive Bayes classifier on the given DataFrame.

        Calling ``fit`` again on the same instance discards the previous model
        completely and replaces it with one learned from the new data.

        Args:
            df: The training DataFrame. Every column other than ``target_column``
                is used as a categorical feature.
            target_column: The name of the target/class column.
        """
        self.target_column = target_column
        self._calculate_priors(df)
        self._calculate_conditionals(df)

    def predict(self, new_instance: dict):
        """
        Predicts the class for a new instance.

        Args:
            new_instance: A dictionary mapping feature names to values. A feature
                that is absent from the dictionary is looked up as ``None`` and
                is therefore normally scored with the unseen-value probability.

        Returns:
            A tuple ``(predicted_class, scores)``. ``scores`` maps each class to
            ``P(class) * product of P(value | class)`` over the features; these
            values are proportional to the posterior probabilities but are not
            normalized, so they do not sum to 1.

        Raises:
            ValueError: If the classifier has not been fitted, or was fitted on
                data containing no class labels.
        """
        if not self.priors:
            raise ValueError(
                "NaiveBayesClassifier has no fitted classes; call fit() with "
                "non-empty training data before predict()."
            )

        posterior_probabilities = {}

        for class_label, prior_prob in self.priors.items():
            total_class_instances = self.total_instances_by_class[class_label]
            class_table = self.conditionals[class_label]
            total_prob = prior_prob

            for feature in self.features:
                feature_value = new_instance.get(feature)
                num_unique_values = self.unique_feature_values_count[feature]

                # Values never observed during training fall back to the
                # smoothed probability of a zero-count value.
                conditional_prob = class_table[feature].get(
                    feature_value,
                    self.alpha / (total_class_instances + self.alpha * num_unique_values)
                )
                total_prob *= conditional_prob

            posterior_probabilities[class_label] = total_prob

        predicted_class = max(posterior_probabilities, key=posterior_probabilities.get)

        return predicted_class, posterior_probabilities

In [25]:
# Read the sample dataset; the first CSV column becomes the row index
data = pd.read_csv(filepath_or_buffer='./data/sample_data.csv', index_col=0)
data

Unnamed: 0,outlook,temp,humidity,play_tennis
0,sunny,hot,high,no
1,sunny,hot,high,no
2,overcast,hot,high,yes
3,rainy,mild,high,yes
4,rainy,cool,normal,yes
5,rainy,cool,normal,no
6,overcast,cool,normal,yes
7,sunny,mild,high,no
8,sunny,cool,normal,yes
9,rainy,mild,normal,yes


In [26]:
# Build the classifier with standard Laplace smoothing (alpha = 1)
naive = NaiveBayesClassifier(alpha=1.0)

# Learn the priors and conditionals, using 'play_tennis' as the class column
naive.fit(df=data, target_column='play_tennis')

In [27]:
# Feature values describing the day to classify
new_day = {
    'outlook': 'overcast',
    'temp': 'hot',
    'humidity': 'normal',
}

# Unpack the winning label and the per-class scores
predicted_class, posterior_probs = naive.predict(new_instance=new_day)

print("\nNew instance to predict:", new_day)
print("Posterior probabilities:", posterior_probs)
print("Predicted class:", predicted_class)


New instance to predict: {'outlook': 'overcast', 'temp': 'hot', 'humidity': 'normal'}
Posterior probabilities: {'yes': 0.04261363636363637, 'no': 0.004783163265306122}
Predicted class: yes
