In [1]:
import pandas as pd
import numpy as np

# User configuration: set all the inputs here.
# Excel workbook holding the training data (passed to pd.read_excel).
input_file = "bayes - basic.xlsx"
# Excel column range to load (passed as `usecols`).
input_cols = 'A:E'
# Name of the column that holds the class label to predict.
target_col = 'Enrolls'
# Feature values of the sample to classify, listed in the same order as the
# non-target columns of the sheet. The data-loading cell skips building a
# sample when this is None or 0.
predict_row = ["<=30","Medium","Yes","Fair"]

In [2]:
# Read the training data from Excel, keeping only the configured columns.
df = pd.read_excel(input_file, usecols=input_cols)
# Normalise the target column: trim surrounding whitespace and turn blank
# strings into NaN so that unlabeled rows can be excluded from training below.
df[target_col] = df[target_col].str.strip().replace(r'^\s*$', np.nan, regex=True)

# Every column other than the target is treated as a feature.
features = [feature for feature in df.columns if feature != target_col]

# Training rows are the rows that actually have a target value.
train_sample = df.dropna(subset=[target_col])

# Build the sample to classify. It stays an empty list when no prediction row
# is configured (None, 0 or an empty list).
test_sample = []
if predict_row:
    if len(predict_row) != len(features):
        raise ValueError(
            f"predict_row has {len(predict_row)} values but there are "
            f"{len(features)} feature columns: {features}"
        )
    # Trim string values; leave non-string values (e.g. numbers) untouched.
    test_sample = pd.Series(
        [item.strip() if isinstance(item, str) else item for item in predict_row],
        index=features,
    )

classes = train_sample[target_col].unique().tolist()

# Number of training rows per class. NOTE: despite its name, class_probs holds
# raw counts, not probabilities.
total_samples = len(train_sample)  # labeled rows only
target_counts = train_sample[target_col].value_counts()
# int(...) keeps plain Python ints so the printed dict looks the same as before.
class_probs = {cls: int(target_counts[cls]) for cls in classes}

feature_probs = {}
# Distinct non-null values of each feature, taken from every row of df
# (rows without a target value included).
feature_config = {feature: df[feature].dropna().unique().tolist() for feature in features}

print(class_probs)
print(feature_config)

{'No': 5, 'Yes': 9}
{'Age': ['<=30', '31 to 40', '>40'], 'Income': ['High', 'Medium', 'Low'], 'JobSatisfaction': ['No', 'Yes'], 'Desire': ['Fair', 'Excellent']}


In [3]:
# Conditional Laplace (add-one) smoothing:
# only applied when at least one value has a count of 0.
def apply_smoothing(counts, total, k):
    """Return ``(counts, total)``, add-one smoothed if any count is zero.

    Parameters
    ----------
    counts : object supporting elementwise ``== 0`` followed by ``.any()``
        (the calling loop passes a pandas Series of per-value counts).
    total : number
        Denominator that goes with ``counts``.
    k : number
        Amount added to ``total`` when smoothing is applied (the calling loop
        passes the number of distinct values of the feature).

    Returns
    -------
    tuple
        ``(counts + 1, total + k)`` when some count equals 0, otherwise the
        inputs unchanged.
    """
    has_zero_count = (counts == 0).any()
    if not has_zero_count:
        return counts, total
    return counts + 1, total + k

# Estimate P(feature value | class) for every feature.
# feature_probs stores the unrounded probabilities so that the prediction step
# multiplies exact values (rounding first loses precision and can turn a very
# small smoothed probability into 0.0). Rounding to 4 decimals is applied only
# to the table that is printed.
for feature, possible_values in feature_config.items():
    k = len(possible_values)  # number of distinct values the feature can take

    count_table = {}
    prob_table = {}

    for cls in classes:
        cls_data = df[df[target_col] == cls]
        # Count each value within the class; values the class never shows get 0.
        counts = cls_data[feature].value_counts()
        counts = counts.reindex(possible_values, fill_value=0)
        counts_smoothed, denominator = apply_smoothing(counts, class_probs[cls], k)
        count_table[cls] = counts_smoothed
        prob_table[cls] = counts_smoothed / denominator

    result_count = pd.DataFrame(count_table)
    result_prob = pd.DataFrame(prob_table)
    result_prob.columns = [col+"_prob" for col in result_prob.columns]
    feature_probs[feature] = result_prob
    print(f"Feature: {feature}")
    print(pd.concat([result_count, result_prob.round(4)], axis=1))
    print("\n")
   

Feature: Age
          No  Yes  No_prob  Yes_prob
Age                                 
<=30       4    2    0.500    0.2222
31 to 40   1    4    0.125    0.4444
>40        3    3    0.375    0.3333


Feature: Income
        No  Yes  No_prob  Yes_prob
Income                            
High     2    2      0.4    0.2222
Medium   2    4      0.4    0.4444
Low      1    3      0.2    0.3333


Feature: JobSatisfaction
                 No  Yes  No_prob  Yes_prob
JobSatisfaction                            
No                4    3      0.8    0.3333
Yes               1    6      0.2    0.6667


Feature: Desire
           No  Yes  No_prob  Yes_prob
Desire                               
Fair        2    6      0.4    0.6667
Excellent   3    3      0.6    0.3333




In [4]:
def predict(sample, normalize=False):
    """Score every class for ``sample`` with the naive Bayes tables.

    For each class the score is the class prior (its count in ``class_probs``
    divided by the sum of all counts) multiplied by the conditional
    probability of each feature value, looked up in ``feature_probs``.

    Parameters
    ----------
    sample : mapping-like (e.g. pandas Series)
        Must return the feature value for each name in ``features`` via
        ``sample[feature]``.
    normalize : bool, default False
        When True, the scores are divided by their sum so they add up to 1.
        When False (default), the raw products are returned.

    Returns
    -------
    dict
        Maps each class in ``classes`` to its score.

    Raises
    ------
    KeyError
        If a feature value of ``sample`` is not present in that feature's
        probability table.
    """
    total_count = sum(class_probs.values())
    prob_dic = {cls: class_probs[cls] / total_count for cls in classes}
    for feature in features:
        value = sample[feature]
        feature_df = feature_probs[feature]
        if value not in feature_df.index:
            # Same exception type as the plain lookup, but with context.
            raise KeyError(
                f"Value {value!r} of feature {feature!r} was not seen in the "
                f"training data; known values: {list(feature_df.index)}"
            )
        # Multiply in the conditional probability of this value for each class.
        for cls in classes:
            prob_dic[cls] *= feature_df.loc[value, cls + "_prob"]

    if normalize:
        total_score = sum(prob_dic.values())
        # Leave all-zero scores untouched rather than dividing by zero.
        if total_score > 0:
            prob_dic = {cls: score / total_score for cls, score in prob_dic.items()}

    return prob_dic

# Prediction
# test_sample is an empty list when no prediction row was configured; in that
# case there is nothing to classify, so report it instead of failing inside
# predict() or on test_sample.to_string().
if test_sample is None or len(test_sample) == 0:
    print("No prediction row configured; nothing to predict.")
else:
    # predict() returns, per class, the prior multiplied by the conditional
    # probabilities of the sample's feature values. These scores are not
    # normalized, so the percentages printed below need not add up to 100%.
    prediction = predict(test_sample)

    # ----------------- Outcome -----------------
    print("Predicted value feature ")
    print(test_sample.to_string())
    print("\nPredicted result probability：")
    for cls, prob in prediction.items():
        print(f"{cls}: {prob:.2%}")
    # The predicted class is the one with the highest score.
    print(f"\nFinal Predicton: {max(prediction, key=prediction.get)}")

Predicted value feature 
Age                  <=30
Income             Medium
JobSatisfaction       Yes
Desire               Fair

Predicted result probability：
No: 0.57%
Yes: 2.82%

Final Predicton: Yes
