<a href="https://colab.research.google.com/github/Kirans1ngh/Machine-Learning-practice/blob/main/Bayesian%20Classifiers/Bayesian_Classification_with_NumPy_and_scikit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# --- Dataset from the image ---
# 14 labelled customers: four categorical attributes plus the target
# column 'buys_computer'.
data = {
    'age': ['<=30', '<=30', '31...40', '>40', '>40', '>40', '31...40', '<=30', '<=30', '>40', '<=30', '31...40', '31...40', '>40'],
    'income': ['high', 'high', 'high', 'medium', 'low', 'low', 'low', 'medium', 'low', 'medium', 'medium', 'medium', 'high', 'medium'],
    'student': ['no', 'no', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no'],
    'credit_rating': ['fair', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'excellent'],
    'buys_computer': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
}
df = pd.DataFrame(data)

# --- Preprocessing ---
# Integer-encode every column independently (a LabelEncoder is fitted per
# column, so each column gets its own 0..k-1 code range).
df_encoded = df.apply(lambda column: LabelEncoder().fit_transform(column))

# Target vector (y) and feature matrix (X) as plain NumPy arrays
y = df_encoded['buys_computer'].to_numpy()
X = df_encoded.drop(columns=['buys_computer']).to_numpy()

# No train/test split is performed in this cell.
# Note: With a small dataset, the split can heavily influence the result,
# so for this demonstration the model is trained on the whole dataset.
X_train, y_train = X, y

# --- Custom Categorical Naive Bayes Classifier ---
class CategoricalNaiveBayes:
    """Naive Bayes classifier for integer-encoded categorical features.

    ``fit`` estimates, from the training data only:
      * the prior of every class,
      * the relative frequency of every feature value within every class,
      * the number of training samples per class, and
      * the number of distinct values each feature takes in the training data.

    ``predict`` combines these in log space and applies Laplace (add-one)
    smoothing, so a feature value that was never observed together with a
    class does not drive that class's posterior to zero.

    Everything needed for prediction is stored on the instance; the
    classifier does not read any variable from the surrounding notebook.
    """

    def fit(self, X, y):
        """Estimate priors and per-class feature-value frequencies.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Encoded categorical features.
        y : array-like of shape (n_samples,)
            Class label of every sample.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # Class priors and per-class sample counts (the counts are needed
        # again at prediction time for the smoothing formula).
        self._priors = np.zeros(n_classes)
        self._class_counts = np.zeros(n_classes, dtype=int)
        # Number of distinct values per feature over the whole training set;
        # this is the Laplace smoothing term added to the denominator.
        self._n_categories = [
            len(np.unique(X[:, feature_i])) for feature_i in range(n_features)
        ]
        # One dict per feature: {class: {feature value: relative frequency}}
        self._likelihoods = [{} for _ in range(n_features)]

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._class_counts[idx] = len(X_c)
            self._priors[idx] = len(X_c) / n_samples

            for feature_i in range(n_features):
                # Get unique values and their counts for the current feature
                unique_vals, counts = np.unique(X_c[:, feature_i], return_counts=True)
                # Relative frequency of each value within class c
                self._likelihoods[feature_i][c] = {
                    val: count / len(X_c) for val, count in zip(unique_vals, counts)
                }

    def predict(self, X):
        """Return the predicted class for every row of ``X`` as a NumPy array."""
        return np.array([self._predict_single(x) for x in np.asarray(X)])

    def _predict_single(self, x):
        """Return the class with the highest log-posterior for one sample."""
        posteriors = []
        for idx, c in enumerate(self._classes):
            n_class_samples = self._class_counts[idx]
            posterior = np.log(self._priors[idx])

            for feature_i, feature_val in enumerate(x):
                # Relative frequency of this value within class c
                # (0 when the value was never seen for the class).
                prob = self._likelihoods[feature_i][c].get(feature_val, 0)
                # Laplace smoothing: prob * n_class_samples recovers the raw
                # count; add 1 to it and add the feature's number of
                # categories to the denominator.
                smoothed_prob = (prob * n_class_samples + 1) / (
                    n_class_samples + self._n_categories[feature_i]
                )
                posterior += np.log(smoothed_prob)

            posteriors.append(posterior)

        return self._classes[np.argmax(posteriors)]

# --- Training and Prediction ---
nb_numpy = CategoricalNaiveBayes()
nb_numpy.fit(X_train, y_train)

# Let's create a new sample to predict, kept in its human-readable form.
sample = {'age': '<=30', 'income': 'medium', 'student': 'yes', 'credit_rating': 'fair'}

# Encode the sample from the data instead of hard-coding the integers.
# LabelEncoder assigns codes in sorted label order, so fitting it on the same
# column reproduces the codes used for the training matrix.
# For this sample the result is age=1, income=2, student=1, credit_rating=1.
feature_cols = [col for col in df.columns if col != 'buys_computer']
new_sample = np.array([[
    LabelEncoder().fit(df[col]).transform([sample[col]])[0] for col in feature_cols
]])
prediction = nb_numpy.predict(new_sample)

# Decode the prediction back to the original label ('yes' or 'no')
le_buys_computer = LabelEncoder().fit(df['buys_computer'])
predicted_label = le_buys_computer.inverse_transform(prediction)

print("--- Bayesian Classification with NumPy ---")
print("Sample to predict: " + ", ".join(f"{col}='{sample[col]}'" for col in feature_cols))
print(f"Predicted class: '{predicted_label[0]}'")

--- Bayesian Classification with NumPy ---
Sample to predict: age='<=30', income='medium', student='yes', credit_rating='fair'
Predicted class: 'yes'


In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# --- Dataset from the image ---
data = {
    'age': ['<=30', '<=30', '31...40', '>40', '>40', '>40', '31...40', '<=30', '<=30', '>40', '<=30', '31...40', '31...40', '>40'],
    'income': ['high', 'high', 'high', 'medium', 'low', 'low', 'low', 'medium', 'low', 'medium', 'medium', 'medium', 'high', 'medium'],
    'student': ['no', 'no', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no'],
    'credit_rating': ['fair', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'excellent'],
    'buys_computer': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
}
df = pd.DataFrame(data)

# --- Preprocessing ---
# Fit one LabelEncoder per column and keep it, so the same mapping is reused
# for encoding the data, decoding predictions and encoding new samples.
encoders = {col: LabelEncoder().fit(df[col]) for col in df.columns}
le_buys_computer = encoders['buys_computer']

# Convert categorical data to numerical data
df_encoded = df.apply(lambda column: encoders[column.name].transform(column))

# Separate features (X) and target (y)
X = df_encoded.drop('buys_computer', axis=1)
y = df_encoded['buys_computer']

# Split data into a training set and a small test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Training the Classifier ---
# Initialize the Categorical Naive Bayes model.
# min_categories pins the number of categories per feature to what the full
# dataset contains. Without it, a split that leaves a category out of the
# training rows would shrink the learned category count, which skews the
# smoothing and can fail at predict time.
cnb = CategoricalNB(
    min_categories=[len(encoders[col].classes_) for col in X.columns]
)

# Train the model
cnb.fit(X_train, y_train)

# --- Evaluation ---
# Make predictions on the test set
predictions = cnb.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

# Decode labels for the report
y_test_labels = le_buys_computer.inverse_transform(y_test)
prediction_labels = le_buys_computer.inverse_transform(predictions)
target_names = le_buys_computer.classes_

print("--- Bayesian Classification with Scikit-learn ---")
print(f"Test Set Actual Labels:    {y_test_labels}")
print(f"Test Set Predicted Labels: {prediction_labels}\n")
print(f"Accuracy on the test set: {accuracy:.4f}\n")
print("Classification Report:")
# Passing labels explicitly keeps target_names aligned even when a class is
# missing from the test split (otherwise classification_report raises);
# zero_division=0 then reports 0 instead of warning for such a class.
print(classification_report(
    y_test,
    predictions,
    labels=le_buys_computer.transform(target_names),
    target_names=target_names,
    zero_division=0,
))

# --- Predicting a New Sample ---
# Sample: age='<=30', income='medium', student='yes', credit_rating='fair'
# We need to encode this sample using the same LabelEncoders
new_data = {
    'age': '<=30',
    'income': 'medium',
    'student': 'yes',
    'credit_rating': 'fair'
}
new_sample_encoded = [encoders[col].transform([new_data[col]])[0] for col in X.columns]

# Make the prediction. The model was fitted on a DataFrame, so pass a
# DataFrame with the same columns to avoid the feature-names warning.
new_sample_df = pd.DataFrame([new_sample_encoded], columns=X.columns)
prediction_encoded = cnb.predict(new_sample_df)
predicted_label = le_buys_computer.inverse_transform(prediction_encoded)

print("\n--- Prediction on New Data ---")
print(f"Sample to predict: {new_data}")
print(f"Predicted class: 'buys_computer' = '{predicted_label[0]}'")


--- Bayesian Classification with Scikit-learn ---
Test Set Actual Labels:    ['no' 'yes' 'yes']
Test Set Predicted Labels: ['yes' 'yes' 'yes']

Accuracy on the test set: 0.6667

Classification Report:
              precision    recall  f1-score   support

          no       0.00      0.00      0.00         1
         yes       0.67      1.00      0.80         2

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3


--- Prediction on New Data ---
Sample to predict: {'age': '<=30', 'income': 'medium', 'student': 'yes', 'credit_rating': 'fair'}
Predicted class: 'buys_computer' = 'yes'


