## Naive Bayes Classifier

Naive Bayes Classifier model with Python

In [184]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.cluster import KMeans

# Load the golf dataset from a CSV file (path is relative to the notebook's working directory)
df = pd.read_csv('lab_datasets/golf_dataset.csv')

# Bare expression: the notebook renders the loaded DataFrame as this cell's output
df

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play_golf
0,sunny,85.0,85.0,False,No
1,sunny,80.0,90.0,True,No
2,overcast,83.0,78.0,False,Yes
3,rain,70.0,96.0,False,Yes
4,rain,68.0,80.0,False,Yes
5,rain,65.0,70.0,True,No
6,overcast,64.0,65.0,True,Yes
7,sunny,72.0,95.0,False,No
8,sunny,69.0,70.0,False,Yes
9,rain,75.0,80.0,False,Yes


In [185]:
# One-hot encode 'Outlook': every category becomes its own 0/1 integer column.
# The empty prefix and separator leave the new columns named after the bare
# category value instead of 'Outlook_<value>'.
df = pd.get_dummies(
    df,
    columns=['Outlook'],
    prefix='',
    prefix_sep='',
    dtype=int,
)

In [186]:
# Encode the target and the 'Wind' flag as 0/1.
# Series.map turns any value that is missing from the dict into NaN.
for column, encoding in (
    ('Play_golf', {'Yes': 1, 'No': 0}),
    ('Wind', {True: 1, False: 0}),
):
    df[column] = df[column].map(encoding)

We need to divide the Humidity and Temperature columns into four groups each. We can use an unsupervised learning algorithm (K-Means clustering) to do this.

In [187]:
# Discretise each continuous column into 4 groups by clustering its values.
# fit_predict refits the estimator on every call, so one KMeans object can be
# reused; each column is replaced by the cluster label assigned to its rows.
kmeans = KMeans(n_clusters=4, random_state=42)
for column in ('Humidity', 'Temperature'):
    df[column] = kmeans.fit_predict(df[[column]])

Take 80% of the dataset for training and 20% for testing.

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Count the training rows where golf was played (Play_golf == 1).
# The boolean mask is built from df_train itself so that its index matches the
# frame being filtered; a mask taken from the full `df` has a different index
# and pandas would have to reindex it (with a warning) before applying it.
len(df_train[df_train['Play_golf'] == 1])


13

In [None]:
n = len(df_train)
target = 'Play_golf'

# Prior P(class): share of training rows carrying each class label.
# Keys are the stringified labels, in order of first appearance.
prob_target = {
    str(label): len(df_train[df_train[target] == label]) / n
    for label in df_train[target].unique()
}

In [255]:
# Smoothed conditional probabilities P(feature = value | class),
# keyed by (feature name, feature value, class label).
prob_feature_given_class = {}

# Class labels and their training counts do not depend on the feature,
# so compute them once instead of inside the innermost loop.
class_labels = df_train[target].unique()
class_counts = {c: len(df_train[df_train[target] == c]) for c in class_labels}

for i in df_train.columns:
    # Skip the target feature
    if i == target:
        continue
    feature_values = df_train[i].unique()
    k = len(feature_values)  # number of distinct values, used for Laplace smoothing
    for j in feature_values:
        # j is a value taken by feature i, not a class label
        print(f"Feature {i}, Value {j}")
        # Training rows where the feature takes this value; shared by all classes below
        subset = df_train[df_train[i] == j]
        for c in class_labels:
            m = len(subset[subset[target] == c])
            count_c = class_counts[c]
            # Laplace (add-one) smoothing so unseen combinations never get probability 0
            prob_features = (m + 1) / (count_c + k)
            # Joint probability from the smoothed conditional and the class prior
            p = prob_features * prob_target[str(c)]
            prob_feature_given_class[(i, j, c)] = prob_features
            print(f"p({i} = {j} & {c}): {p}")
            print(f"p({i} = {j} | {c}): {prob_features}")
            print(f"p({i} ≠ {j} | {c}): {1 - prob_features}")
            print()

Feature Temperature, Class 3
p(Temperature = 3 & 0): 0.1258741258741259
p(Temperature = 3 | 0): 0.3076923076923077
p(Temperature ≠ 3 | 0): 0.6923076923076923

p(Temperature = 3 & 1): 0.10427807486631017
p(Temperature = 3 | 1): 0.17647058823529413
p(Temperature ≠ 3 | 1): 0.8235294117647058

Feature Temperature, Class 1
p(Temperature = 1 & 0): 0.06293706293706294
p(Temperature = 1 | 0): 0.15384615384615385
p(Temperature ≠ 1 | 0): 0.8461538461538461

p(Temperature = 1 & 1): 0.1737967914438503
p(Temperature = 1 | 1): 0.29411764705882354
p(Temperature ≠ 1 | 1): 0.7058823529411764

Feature Temperature, Class 0
p(Temperature = 0 & 0): 0.09440559440559441
p(Temperature = 0 | 0): 0.23076923076923078
p(Temperature ≠ 0 | 0): 0.7692307692307692

p(Temperature = 0 & 1): 0.20855614973262035
p(Temperature = 0 | 1): 0.35294117647058826
p(Temperature ≠ 0 | 1): 0.6470588235294117

Feature Temperature, Class 2
p(Temperature = 2 & 0): 0.1258741258741259
p(Temperature = 2 | 0): 0.3076923076923077
p(Tempera

In [None]:
predictions = []

# These depend only on the training data, so compute them once rather than
# once per test row.
class_labels = df_train[target].unique()  # e.g. 1 / 0
class_counts = {c: len(df_train[df_train[target] == c]) for c in class_labels}
feature_columns = [col for col in df_train.columns if col != target]

for idx, row in df_test.iterrows():  # loop through each test record
    posterior = {}  # unnormalised, then normalised, posterior per class

    for c in class_labels:
        # Start with the prior P(Class=c)
        p_class = prob_target[str(c)]
        # Multiply by each conditional P(Feature=value | Class=c)
        for feature in feature_columns:
            value = row[feature]
            if (feature, value, c) in prob_feature_given_class:
                p_class *= prob_feature_given_class[(feature, value, c)]
            else:
                # Feature value never seen in training: fall back to the
                # Laplace-smoothed probability of a zero count
                k = len(df_train[feature].unique())
                count_c = class_counts[c]
                p_class *= 1 / (count_c + k)
        posterior[c] = p_class

    # Normalize so the posteriors sum to 1 (not required for the argmax itself)
    total = sum(posterior.values())
    for c in posterior:
        posterior[c] /= total

    # Predicted class is the one with the highest posterior
    predicted_class = max(posterior, key=posterior.get)
    predictions.append(predicted_class)
    print(f"Test sample {idx}: {posterior} → Predicted: {predicted_class}")

# Attach predictions as a new column. assign() returns a new DataFrame instead
# of writing into the frame returned by the split, which keeps this cell safe
# to re-run.
df_test = df_test.assign(Predicted=predictions)

Test sample 9: {np.int64(0): 0.35641107712591363, np.int64(1): 0.6435889228740863} → Predicted: 1
Test sample 25: {np.int64(0): 0.16482196214715797, np.int64(1): 0.8351780378528421} → Predicted: 1
Test sample 8: {np.int64(0): 0.7392702288509299, np.int64(1): 0.26072977114907014} → Predicted: 0
Test sample 21: {np.int64(0): 0.4247525930954888, np.int64(1): 0.5752474069045111} → Predicted: 1
Test sample 0: {np.int64(0): 0.4596395569998134, np.int64(1): 0.5403604430001866} → Predicted: 1
Test sample 12: {np.int64(0): 0.05337593677869286, np.int64(1): 0.9466240632213072} → Predicted: 1


In [260]:
# Display the test rows; after the prediction cell has run this includes the 'Predicted' column
df_test

Unnamed: 0,Temperature,Humidity,Wind,Play_golf,overcast,rain,sunny,Predicted
9,0,2,0,1,0,1,0,1
25,3,1,1,1,1,0,0,1
8,2,0,0,1,0,0,1,0
21,0,1,0,1,0,1,0,1
0,1,1,0,0,0,0,1,1
12,3,2,0,1,1,0,0,1
