# Gaussian Naive Bayes

In [23]:
# Imports: data handling, train/test splitting and evaluation metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

## View data

In [24]:
# Toy weather dataset: four numeric features and a Yes/No 'Play' label, 28 rows.
dataset_dict = {
    'Rainfall': [0.0, 2.0, 7.0, 18.0, 3.0, 3.0, 0.0, 1.0, 0.0, 25.0, 0.0, 18.0, 9.0, 5.0, 0.0, 1.0, 7.0, 0.0, 0.0, 7.0, 5.0, 3.0, 0.0, 2.0, 0.0, 8.0, 4.0, 4.0],
    'Temperature': [29.4, 26.7, 28.3, 21.1, 20.0, 18.3, 17.8, 22.2, 20.6, 23.9, 23.9, 22.2, 27.2, 21.7, 27.2, 23.3, 24.4, 25.6, 27.8, 19.4, 29.4, 22.8, 31.1, 25.0, 26.1, 26.7, 18.9, 28.9],
    'Humidity': [85.0, 90.0, 78.0, 96.0, 80.0, 70.0, 65.0, 95.0, 70.0, 80.0, 70.0, 90.0, 75.0, 80.0, 88.0, 92.0, 85.0, 75.0, 92.0, 90.0, 85.0, 88.0, 65.0, 70.0, 60.0, 95.0, 70.0, 78.0],
    'WindSpeed': [2.1, 21.2, 1.5, 3.3, 2.0, 17.4, 14.9, 6.9, 2.7, 1.6, 30.3, 10.9, 3.0, 7.5, 10.3, 3.0, 3.9, 21.9, 2.6, 17.3, 9.6, 1.9, 16.0, 4.6, 3.2, 8.3, 3.2, 2.2],
    'Play': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes']
}
df = pd.DataFrame(dataset_dict)

# Feature matrix X is every column except the label; target vector y is the 'Play' column.
X, y = df.drop(columns='Play'), df['Play']

# Split 50/50 without shuffling: the first 14 rows become the training set and
# the last 14 the test set, so the split is deterministic without a random seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, shuffle=False)

In [25]:
# Display the weather DataFrame built in the previous cell.
df

Unnamed: 0,Rainfall,Temperature,Humidity,WindSpeed,Play
0,0.0,29.4,85.0,2.1,No
1,2.0,26.7,90.0,21.2,No
2,7.0,28.3,78.0,1.5,Yes
3,18.0,21.1,96.0,3.3,Yes
4,3.0,20.0,80.0,2.0,Yes
5,3.0,18.3,70.0,17.4,No
6,0.0,17.8,65.0,14.9,Yes
7,1.0,22.2,95.0,6.9,No
8,0.0,20.6,70.0,2.7,Yes
9,25.0,23.9,80.0,1.6,Yes


In [26]:
# Re-attach the label column to each split so features and target can be read side by side.
train_view = pd.concat([X_train, y_train], axis=1)
test_view = pd.concat([X_test, y_test], axis=1)

print(train_view, end='\n\n')
print(test_view)

    Rainfall  Temperature  Humidity  WindSpeed Play
0        0.0         29.4      85.0        2.1   No
1        2.0         26.7      90.0       21.2   No
2        7.0         28.3      78.0        1.5  Yes
3       18.0         21.1      96.0        3.3  Yes
4        3.0         20.0      80.0        2.0  Yes
5        3.0         18.3      70.0       17.4   No
6        0.0         17.8      65.0       14.9  Yes
7        1.0         22.2      95.0        6.9   No
8        0.0         20.6      70.0        2.7  Yes
9       25.0         23.9      80.0        1.6  Yes
10       0.0         23.9      70.0       30.3  Yes
11      18.0         22.2      90.0       10.9  Yes
12       9.0         27.2      75.0        3.0  Yes
13       5.0         21.7      80.0        7.5   No

    Rainfall  Temperature  Humidity  WindSpeed Play
14       0.0         27.2      88.0       10.3   No
15       1.0         23.3      92.0        3.0  Yes
16       7.0         24.4      85.0        3.9  Yes
17       0.

## Preprocessing data

In [27]:
from sklearn.preprocessing import PowerTransformer

# Initialize the PowerTransformer; standardize=True also standard-scales the
# transformed features, so no separate scaler is used in this notebook.
pt = PowerTransformer(standardize=True) # Standard Scaling already included
# Fit on the training features only, then reuse the fitted transformer on the
# test features so no test-set statistics enter the preprocessing.
X_train_transformed = pt.fit_transform(X_train)
X_test_transformed = pt.transform(X_test)

## Step by step calculation

In [28]:
from fractions import Fraction

def calc_target_prob(attr):
    """Return the relative frequency of every distinct value in ``attr``.

    Parameters
    ----------
    attr : pandas.Series
        Column whose value distribution is wanted (e.g. the class labels).

    Returns
    -------
    pandas.Series
        Indexed by distinct value, in ``value_counts`` order; each entry is a
        ``fractions.Fraction`` equal to count / total count.
    """
    counts = attr.value_counts()
    n_total = counts.sum()

    def as_fraction(count):
        # Exact rational form of count / n_total.
        return Fraction(count, n_total).limit_denominator()

    return counts.apply(as_fraction)

# Class priors: share of each 'Play' label in the training split, as exact fractions.
print(calc_target_prob(y_train))

Play
Yes    9/14
No     5/14
Name: count, dtype: object


In [29]:
def calculate_class_probabilities(X_train_transformed, y_train, feature_names):
    """Build a table of per-class Gaussian density formulas, rendered as text.

    Parameters
    ----------
    X_train_transformed : numpy.ndarray
        Training features, one row per sample and one column per feature.
    y_train : pandas.Series
        Class label for each training row.
    feature_names : iterable of str
        Column labels, in the same order as the columns of the feature array.

    Returns
    -------
    pandas.DataFrame
        One row per class and one column per feature; each cell is a string of
        the form ``k1·exp(-(x-(mean))²/k2)`` with ``k1 = 1/(std·sqrt(2π))`` and
        ``k2 = 2·std²``.
    """
    labels = y_train.unique()
    table = pd.DataFrame(index=labels, columns=feature_names)

    for label in labels:
        rows = X_train_transformed[y_train == label]
        mu = rows.mean(axis=0)
        sigma = rows.std(axis=0)

        # Normalising constant and exponent denominator of the Gaussian pdf.
        coeff = 1 / (sigma * np.sqrt(2 * np.pi))
        denom = 2 * (sigma ** 2)

        for pos, feature in enumerate(feature_names):
            table.loc[label, feature] = (
                f"{coeff[pos]:.3f}·exp(-(x-({mu[pos]:.2f}))²/{denom[pos]:.3f})"
            )

    return table

# Build the per-class, per-feature Gaussian formulas from the transformed training
# data; X.columns supplies the feature names used as column headers.
equation_table = calculate_class_probabilities(X_train_transformed, y_train, X.columns)

# Display the equation table (one row per class, one column per feature)
print(equation_table)

                           Rainfall                     Temperature  \
No   0.709·exp(-(x-(-0.35))²/0.633)   0.364·exp(-(x-(0.14))²/2.406)   
Yes   0.354·exp(-(x-(0.20))²/2.543)  0.428·exp(-(x-(-0.08))²/1.741)   

                           Humidity                       WindSpeed  
No    0.448·exp(-(x-(0.40))²/1.588)   0.488·exp(-(x-(0.43))²/1.337)  
Yes  0.403·exp(-(x-(-0.22))²/1.958)  0.394·exp(-(x-(-0.24))²/2.052)  


In [30]:
from scipy.stats import norm

def calculate_class_probability_products(X_train_transformed, y_train, X_new, feature_names, target_name):
    """Tabulate the Naive Bayes terms for one new sample, class by class.

    For every class the table holds the prior P(C), the Gaussian density of
    each feature value of the sample under that class, and the product of the
    prior and all densities (the unnormalised posterior).

    Parameters
    ----------
    X_train_transformed : numpy.ndarray
        Training features, one row per sample and one column per feature.
    y_train : pandas.Series
        Class label for each training row.
    X_new : array-like
        The sample to score, either as shape ``(1, n_features)`` or as a plain
        1-D feature vector. Only the first row is used.
    feature_names : iterable of str
        Column labels, in the same order as the feature columns.
    target_name : str
        Header for the prior-probability column.

    Returns
    -------
    pandas.DataFrame
        Indexed by class, with columns ``[target_name, *feature_names, 'Product']``.
    """
    classes = y_train.unique()

    # Normalise the sample to a flat feature vector so both (1, n) and (n,)
    # inputs are accepted.
    sample = np.atleast_2d(np.asarray(X_new, dtype=float))[0]

    # Create column names using actual feature names
    column_names = [target_name] + list(feature_names) + ['Product']
    probability_products = pd.DataFrame(index=classes, columns=column_names)

    for cls in classes:
        X_class = X_train_transformed[y_train == cls]
        mean = X_class.mean(axis=0)
        std = X_class.std(axis=0)

        # Prior P(C): fraction of training rows carrying this label.
        prior_prob = np.mean(y_train == cls)
        probability_products.loc[cls, target_name] = prior_prob

        feature_probs = []
        for i, feature in enumerate(feature_names):
            # Gaussian density of the i-th feature value under this class.
            prob = norm.pdf(sample[i], mean[i], std[i])
            probability_products.loc[cls, feature] = prob
            feature_probs.append(prob)

        # Unnormalised posterior: prior times the product of per-feature densities.
        probability_products.loc[cls, 'Product'] = prior_prob * np.prod(feature_probs)

    return probability_products

# New sample to score, reshaped to (1, n_features). The four values follow the
# feature column order; NOTE(review): they look like already-transformed
# feature values (same scale as X_train_transformed) - confirm their origin.
X_new = np.array([-1.28, 1.115, 0.84, 0.68]).reshape(1, -1)

# Calculate the prior, per-feature densities and their product for every class
prob_products = calculate_class_probability_products(X_train_transformed, y_train, X_new, X.columns, y.name)

# Display the probability product table; the class with the larger 'Product' wins
print(prob_products)

         Play  Rainfall Temperature  Humidity WindSpeed   Product
No   0.357143  0.182877     0.24475  0.395431  0.465336  0.002941
Yes  0.642857  0.150136    0.189053  0.227162  0.261267  0.001083


## Build from scratch

In [31]:
import numpy as np
import pandas as pd
from scipy.stats import norm
from fractions import Fraction

class NaiveBayesGaussian:
    """Gaussian Naive Bayes classifier written with plain NumPy.

    Each feature is modelled, per class, as an independent normal distribution
    whose mean and (population) standard deviation are estimated in ``fit``.
    ``fit`` and ``predict`` accept NumPy arrays as well as pandas objects.
    """

    # Added to every standard deviation so a constant feature cannot cause a
    # division by zero.
    _EPS = 1e-6

    def __init__(self):
        self.classes = None
        self.priors = {}  # Prior probabilities for each class
        self.mean = {}    # Mean for each feature per class
        self.std = {}     # Standard deviation for each feature per class

    def fit(self, X, y):
        """
        Fit the Naive Bayes model:
        - Calculate prior probabilities (P(C)).
        - Calculate mean and standard deviation for each feature per class.

        X is array-like of shape (n_samples, n_features); y is array-like of
        length n_samples holding the class labels.
        """
        # Work on plain arrays so the statistics do not depend on the container:
        # pandas .std() defaults to ddof=1 while NumPy uses ddof=0, and the
        # pandas variant yields NaN for a class with a single row.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        self.classes = np.unique(y)
        # Drop statistics from any previous fit so no stale classes remain.
        self.priors, self.mean, self.std = {}, {}, {}

        for cls in self.classes:
            X_cls = X[y == cls]
            self.priors[cls] = len(X_cls) / len(X)  # Prior: P(C)
            self.mean[cls] = X_cls.mean(axis=0)     # Mean: μ for each feature
            self.std[cls] = X_cls.std(axis=0)       # Population std (ddof=0): σ for each feature

    def _gaussian_likelihood(self, x, mean, std):
        """
        Calculate Gaussian likelihood P(x|C) using the formula:
        P(x|C) = (1 / sqrt(2πσ²)) * exp(-(x - μ)² / (2σ²))
        """
        variance = (std + self._EPS) ** 2
        coeff = 1 / np.sqrt(2 * np.pi * variance)
        exponent = np.exp(-((x - mean) ** 2) / (2 * variance))
        return coeff * exponent

    def _gaussian_log_likelihood(self, x, mean, std):
        """
        Calculate log P(x|C) directly:
        log P(x|C) = -0.5 * log(2πσ²) - (x - μ)² / (2σ²)

        Taking the log of the density instead would underflow to log(0) = -inf
        for samples far from every class mean.
        """
        variance = (std + self._EPS) ** 2
        return -0.5 * np.log(2 * np.pi * variance) - ((x - mean) ** 2) / (2 * variance)

    def _calculate_posterior(self, x):
        """
        Calculate the unnormalised log posterior, log P(C) + Σ log P(x_i|C),
        for each class. Returns a dict keyed by class label.
        """
        posteriors = {}
        for cls in self.classes:
            log_prior = np.log(self.priors[cls])  # Log(P(C))
            log_likelihood = np.sum(
                self._gaussian_log_likelihood(x, self.mean[cls], self.std[cls])
            )
            posteriors[cls] = log_prior + log_likelihood
        return posteriors

    def predict(self, X):
        """
        Predict the class for each sample (row) in X.

        X may be a DataFrame or any array-like of shape (n_samples, n_features).
        Returns a NumPy array with one predicted label per row.
        """
        # np.asarray handles DataFrames and ndarrays alike.
        X = np.asarray(X, dtype=float)

        predictions = []
        for x in X:
            posteriors = self._calculate_posterior(x)
            # Pick the class with the largest log posterior.
            predictions.append(max(posteriors, key=posteriors.get))
        return np.array(predictions)


In [32]:
# Initialize the from-scratch Gaussian Naive Bayes classifier
nb = NaiveBayesGaussian()

# Fit the model. This cell uses the raw (untransformed) features, whereas the
# scikit-learn GaussianNB cell further down is trained on X_train_transformed,
# so the two sets of scores are not directly comparable.
nb.fit(X_train, y_train)

# Predict the test set once and reuse the result for every metric
y_pred = nb.predict(X_test)
predictions = y_pred  # same array under its earlier name
print("Accuracy:", accuracy_score(y_test, y_pred))

# Evaluate the model
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Calculate F1, Recall, and Precision
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.42857142857142855
Confusion Matrix:
[[3 2]
 [6 3]]

Classification Report:
              precision    recall  f1-score   support

          No       0.33      0.60      0.43         5
         Yes       0.60      0.33      0.43         9

    accuracy                           0.43        14
   macro avg       0.47      0.47      0.43        14
weighted avg       0.50      0.43      0.43        14



## Use model GaussianNB

In [33]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Fit scikit-learn's Gaussian Naive Bayes on the power-transformed training features
gnb = GaussianNB().fit(X_train_transformed, y_train)

# Score the held-out rows, transformed with the same fitted PowerTransformer
y_pred = gnb.predict(X_test_transformed)
accuracy = accuracy_score(y_test, y_pred)

# Report the share of correctly classified test samples
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.6429


In [34]:
# Confusion matrix for the most recent predictions in y_pred
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

# Per-class precision, recall and F1
report = classification_report(y_test, y_pred)
print("\nClassification Report:", report, sep="\n")

Confusion Matrix:
[[2 3]
 [2 7]]

Classification Report:
              precision    recall  f1-score   support

          No       0.50      0.40      0.44         5
         Yes       0.70      0.78      0.74         9

    accuracy                           0.64        14
   macro avg       0.60      0.59      0.59        14
weighted avg       0.63      0.64      0.63        14



# Bernoulli Naive Bayes

In [43]:
# Dataset for the Bernoulli example: a categorical 'Outlook', numeric
# 'Temperature' and 'Humidity', a boolean 'Wind' flag and the Yes/No 'Play'
# label (28 rows). NOTE(review): Temperature is on a different scale than in
# the Gaussian dataset above (64-88 here) - presumably Fahrenheit; confirm.
dataset_dict_ber = {
    'Outlook': ['sunny', 'sunny', 'overcast', 'rain', 'rain', 'rain', 'overcast', 'sunny', 'sunny', 'rain', 'sunny', 'overcast', 'overcast', 'rain', 'sunny', 'overcast', 'rain', 'sunny', 'sunny', 'rain', 'overcast', 'rain', 'sunny', 'overcast', 'sunny', 'overcast', 'rain', 'overcast'],
    'Temperature': [85.0, 80.0, 83.0, 70.0, 68.0, 65.0, 64.0, 72.0, 69.0, 75.0, 75.0, 72.0, 81.0, 71.0, 81.0, 74.0, 76.0, 78.0, 82.0, 67.0, 85.0, 73.0, 88.0, 77.0, 79.0, 80.0, 66.0, 84.0],
    'Humidity': [85.0, 90.0, 78.0, 96.0, 80.0, 70.0, 65.0, 95.0, 70.0, 80.0, 70.0, 90.0, 75.0, 80.0, 88.0, 92.0, 85.0, 75.0, 92.0, 90.0, 85.0, 88.0, 65.0, 70.0, 60.0, 95.0, 70.0, 78.0],
    'Wind': [False, True, False, False, False, True, True, False, False, False, True, True, False, True, True, False, False, True, False, True, True, False, True, False, False, True, False, False],
    'Play': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes']
}
df_ber = pd.DataFrame(dataset_dict_ber)


In [36]:
# Display the Bernoulli example DataFrame.
df_ber

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play
0,sunny,85.0,85.0,False,No
1,sunny,80.0,90.0,True,No
2,overcast,83.0,78.0,False,Yes
3,rain,70.0,96.0,False,Yes
4,rain,68.0,80.0,False,Yes
5,rain,65.0,70.0,True,No
6,overcast,64.0,65.0,True,Yes
7,sunny,72.0,95.0,False,No
8,sunny,69.0,70.0,False,Yes
9,rain,75.0,80.0,False,Yes


In [44]:
# Encode into a NEW frame instead of overwriting df_ber, so this cell can be
# re-run and the raw frame shown earlier stays valid:
# - one-hot encode 'Outlook' into 'overcast' / 'rain' / 'sunny' indicator columns
# - convert 'Wind' (bool) and 'Play' ('Yes'/'No') to 0/1 indicators
df_ber_encoded = (
    pd.get_dummies(df_ber, columns=['Outlook'], prefix='', prefix_sep='', dtype=int)
    .assign(
        Wind=lambda frame: frame['Wind'].astype(int),
        Play=lambda frame: (frame['Play'] == 'Yes').astype(int),
    )
)

# Set feature matrix X and target vector y
X, y = df_ber_encoded.drop(columns='Play'), df_ber_encoded['Play']

# Split the data into training and testing sets (first half / second half, no shuffling)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, shuffle=False)

print(pd.concat([X_train, y_train], axis=1), end='\n\n')
print(pd.concat([X_test, y_test], axis=1))

    Temperature  Humidity  Wind  overcast  rain  sunny  Play
0          85.0      85.0     0         0     0      1     0
1          80.0      90.0     1         0     0      1     0
2          83.0      78.0     0         1     0      0     1
3          70.0      96.0     0         0     1      0     1
4          68.0      80.0     0         0     1      0     1
5          65.0      70.0     1         0     1      0     0
6          64.0      65.0     1         1     0      0     1
7          72.0      95.0     0         0     0      1     0
8          69.0      70.0     0         0     0      1     1
9          75.0      80.0     0         0     1      0     1
10         75.0      70.0     1         0     0      1     1
11         72.0      90.0     1         1     0      0     1
12         81.0      75.0     0         1     0      0     1
13         71.0      80.0     1         0     1      0     0

    Temperature  Humidity  Wind  overcast  rain  sunny  Play
14         81.0      88

In [45]:
def binarize_temperature_humidity(features):
    """Return a copy of ``features`` with the numeric columns turned into 0/1 flags.

    'Temperature' is binned into Warm (0, 80] / Hot (80, 100] and 'Humidity'
    into Dry (0, 75] / Humid (75, 100]. Each binned column is one-hot encoded
    with the first level dropped, giving 'Temperature_Hot' and
    'Humidity_Humid'. The input frame is not modified; the two indicator
    columns come first, followed by the remaining original columns.
    """
    binned = pd.DataFrame({
        'Temperature': pd.cut(features['Temperature'], bins=[0, 80, 100], labels=['Warm', 'Hot']),
        'Humidity': pd.cut(features['Humidity'], bins=[0, 75, 100], labels=['Dry', 'Humid']),
    })
    # pd.cut returns categoricals with fixed levels, so the indicator columns
    # are the same for any input even when a level does not occur in it.
    indicators = pd.get_dummies(binned, drop_first=True, dtype=int)
    remaining = features.drop(['Temperature', 'Humidity'], axis=1)
    return pd.concat([indicators, remaining], axis=1)


# Apply the same fixed-threshold encoding to the training and test sets.
# Building new frames avoids assigning columns onto the frames returned by
# train_test_split, which can raise SettingWithCopyWarning.
X_train = binarize_temperature_humidity(X_train)
X_test = binarize_temperature_humidity(X_test)

print(pd.concat([X_train, y_train], axis=1), '\n')
print(pd.concat([X_test, y_test], axis=1))

    Temperature_Hot  Humidity_Humid  Wind  overcast  rain  sunny  Play
0                 1               1     0         0     0      1     0
1                 0               1     1         0     0      1     0
2                 1               1     0         1     0      0     1
3                 0               1     0         0     1      0     1
4                 0               1     0         0     1      0     1
5                 0               0     1         0     1      0     0
6                 0               0     1         1     0      0     1
7                 0               1     0         0     0      1     0
8                 0               0     0         0     0      1     1
9                 0               1     0         0     1      0     1
10                0               0     1         0     0      1     1
11                0               1     1         1     0      0     1
12                1               0     0         1     0      0     1
13    

### Build Bernoulli NB from scratch

In [46]:
class NaiveBayesBernoulli:
    """Bernoulli Naive Bayes classifier for 0/1 features, written with NumPy.

    Each feature is modelled, per class, as an independent Bernoulli variable
    whose success probability is estimated with Laplace (add-one) smoothing.
    ``fit`` and ``predict`` accept NumPy arrays as well as pandas objects.
    """

    def __init__(self):
        self.classes = None
        self.priors = {}  # Prior probabilities for each class
        self.feature_probs = {}  # Feature probabilities P(x_i=1|C) for each class

    def fit(self, X, y):
        """
        Fit the Naive Bayes model:
        - Calculate prior probabilities (P(C)).
        - Calculate feature probabilities (P(x_i=1|C)).

        X is array-like of shape (n_samples, n_features) holding 0/1 values;
        y is array-like of length n_samples holding the class labels.
        """
        # Plain arrays make boolean-mask row selection behave identically for
        # DataFrame and ndarray inputs.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        self.classes = np.unique(y)
        # Drop statistics from any previous fit so no stale classes remain.
        self.priors, self.feature_probs = {}, {}

        for cls in self.classes:
            X_cls = X[y == cls]
            self.priors[cls] = len(X_cls) / len(X)  # Prior: P(C)
            # Laplace smoothing: +1 to the count of ones, +2 to the class size,
            # so every probability stays strictly between 0 and 1.
            self.feature_probs[cls] = (X_cls.sum(axis=0) + 1) / (len(X_cls) + 2)

    def _bernoulli_likelihood(self, x, feature_probs):
        """
        Calculate Bernoulli likelihood P(x|C) using:
        P(x|C) = p^x * (1-p)^(1-x)

        Returns one likelihood per feature.
        """
        likelihoods = (feature_probs**x) * ((1 - feature_probs)**(1 - x))
        return likelihoods

    def _calculate_posterior(self, x):
        """
        Calculate the unnormalised log posterior, log P(C) + Σ log P(x_i|C),
        for each class. Returns a dict keyed by class label.
        """
        posteriors = {}
        for cls in self.classes:
            prior = np.log(self.priors[cls])  # Log(P(C))
            likelihood = np.sum(np.log(self._bernoulli_likelihood(x, self.feature_probs[cls])))
            posteriors[cls] = prior + likelihood
        return posteriors

    def predict(self, X):
        """
        Predict the class for each sample (row) in X.

        X may be a DataFrame or any array-like of shape (n_samples, n_features).
        Returns a NumPy array with one predicted label per row.
        """
        # Iterating a DataFrame yields column names, not rows, so convert to an
        # array first.
        X = np.asarray(X, dtype=float)

        predictions = []
        for x in X:
            posteriors = self._calculate_posterior(x)
            # Pick the class with the largest log posterior.
            predictions.append(max(posteriors, key=posteriors.get))
        return np.array(predictions)


In [47]:
# Create the from-scratch Bernoulli classifier and fit it on the binary
# training features, passed as plain NumPy arrays
nb = NaiveBayesBernoulli()
nb.fit(X_train.to_numpy(), y_train.to_numpy())

# Predict a label for every test row
y_pred = nb.predict(X_test.to_numpy())

# Report the confusion matrix followed by per-class precision / recall / F1
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")


Confusion Matrix:
[[4 1]
 [1 8]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80         5
           1       0.89      0.89      0.89         9

    accuracy                           0.86        14
   macro avg       0.84      0.84      0.84        14
weighted avg       0.86      0.86      0.86        14



### Use scikit-learn's BernoulliNB

In [49]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.naive_bayes import BernoulliNB

# Fit scikit-learn's Bernoulli Naive Bayes; alpha=1 is add-one (Laplace) smoothing
nb_clf = BernoulliNB(alpha=1).fit(X_train, y_train)

# Predict a label for every test row
y_pred = nb_clf.predict(X_test)

# Report the confusion matrix followed by per-class precision / recall / F1
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")


Confusion Matrix:
[[4 1]
 [1 8]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80         5
           1       0.89      0.89      0.89         9

    accuracy                           0.86        14
   macro avg       0.84      0.84      0.84        14
weighted avg       0.86      0.86      0.86        14

