In [98]:
import kagglehub
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from math import sqrt, pi, exp

# Download latest version
folder_path = kagglehub.dataset_download("dheemanthbhat/simple-weather-forecast")

print("Path to dataset files:", folder_path)
print(os.listdir(folder_path))

Path to dataset files: /Users/7n100489/.cache/kagglehub/datasets/dheemanthbhat/simple-weather-forecast/versions/1
['weather_forecast.csv']


In [99]:
csv_path = os.path.join(folder_path, 'weather_forecast.csv')
df = pd.read_csv(csv_path)

temp_val = 'TEMP_SWAP'

df['Outlook'] = df['Outlook'].replace('Sunny', temp_val)
df['Outlook'] = df['Outlook'].replace('Rain', 'Sunny')
df['Outlook'] = df['Outlook'].replace(temp_val, 'Rain')
df

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,Rain,Hot,High,Weak,No
1,Rain,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Sunny,Mild,High,Weak,Yes
4,Sunny,Cool,Normal,Weak,Yes
5,Sunny,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Rain,Mild,High,Weak,No
8,Rain,Cool,Normal,Weak,Yes
9,Sunny,Mild,Normal,Weak,Yes


# NaiveBayesClassifier From Scratch

In [100]:
# --- NaiveBayesClassifier From Scratch ---
class NaiveBayesClassifier:
    def __init__(self, categorical_features, numerical_features, target_feature, use_add1_smoothing=True):
        self.categorical_features = categorical_features
        self.numerical_features = numerical_features
        self.target_feature = target_feature
        self.use_add1_smoothing = use_add1_smoothing

        self.classes = []
        self.prior_probs = {} # P(Class)
        self.likelihoods_categorical = {} # P(Feature_cat | Class) with smoothing
        self.numerical_features_stats = {} # {Class: {Feature_num: {'mean': .., 'std': ..}}}
        self.feature_possible_values = {} # For smoothing denominator

    def fit(self, df):
        """
        ฝึกโมเดล Naive Bayes โดยคำนวณ Prior, Likelihoods, และ Stats สำหรับ Numerical Features
        **Likelihoods สำหรับ Categorical Predictors จะไม่ใช้ Add-1 Smoothing**
        """
        self.classes = df[self.target_feature].unique()
        total_samples = len(df)

        # 1. คำนวณ Prior Probabilities
        for cls in self.classes:
            self.prior_probs[cls] = df[df[self.target_feature] == cls].shape[0] / total_samples

        # 2. คำนวณ Likelihoods สำหรับ Categorical Predictors (ไม่มี Smoothing)
        for feature in self.categorical_features:
            self.feature_possible_values[feature] = df[feature].nunique() # ไม่ได้ใช้แล้วถ้าไม่ smoothing
            for cls in self.classes:
                if cls not in self.likelihoods_categorical:
                    self.likelihoods_categorical[cls] = {}
                self.likelihoods_categorical[cls][feature] = {}

                df_class = df[df[self.target_feature] == cls]
                total_count_class = df_class.shape[0]

                for feat_value in df[feature].unique():
                    count_feat_class = df_class[feature].value_counts().get(feat_value, 0)

                    # *** จุดที่แก้ไข 2: ใช้เงื่อนไข use_add1_smoothing ***
                    if self.use_add1_smoothing:
                        prob = (count_feat_class + 1) / (total_count_class + self.feature_possible_values[feature])
                    else:
                        if total_count_class > 0:
                            prob = count_feat_class / total_count_class
                        else:
                            prob = 0.0 # ถ้าไม่มีข้อมูลในคลาสนี้เลย ให้ Likelihood เป็น 0

                    self.likelihoods_categorical[cls][feature][feat_value] = prob

        # 3. คำนวณ Mean และ Standard Deviation สำหรับ Numerical Predictors
        for feature in self.numerical_features:
            for cls in self.classes:
                if cls not in self.numerical_features_stats:
                    self.numerical_features_stats[cls] = {}

                df_class_feature = df[df[self.target_feature] == cls][feature]
                self.numerical_features_stats[cls][feature] = {
                    'mean': df_class_feature.mean(),
                    'std': df_class_feature.std(ddof=1) # ddof=1 for sample std dev
                }
                # Handle cases where std dev is 0 (e.g., all values are same in a class)
                if self.numerical_features_stats[cls][feature]['std'] == 0:
                    self.numerical_features_stats[cls][feature]['std'] = 1e-9 # Prevent division by zero, small epsilon

    def _normal_pdf(self, x, mean, std):
        """Probability Density Function for Normal Distribution."""
        if std == 0: # Should be handled by epsilon in fit, but as a safeguard
            return 1.0 if x == mean else 0.0
        exponent = exp(-((x - mean) ** 2) / (2 * (std ** 2)))
        return (1 / (sqrt(2 * pi) * std)) * exponent

    def _calculate_unnormalized_posterior_with_terms(self, query_features, target_label):
        """
        คำนวณ Numerator (Score) สำหรับคลาสที่ระบุ พร้อมส่งกลับเทอมการคูณแต่ละตัว
        Score = P(X|c) * P(c)
        """
        score = self.prior_probs[target_label]
        terms = [f"{self.prior_probs[target_label]:.4f}"] # เก็บเฉพาะตัวเลขสำหรับส่วน (0.XX * 0.YY...)

        # สำหรับแสดง P(Yes) = ... หรือ P(No) = ... ในบรรทัดแรกของ P(X|c)*P(c)
        detailed_terms_display = [f"P({target_label}) = {self.prior_probs[target_label]:.4f}"]

        for feat_name, feat_value in query_features.items():
            if feat_name in self.categorical_features:
                p_feat_given_label = self.likelihoods_categorical[target_label][feat_name].get(feat_value, 0)
                score *= p_feat_given_label

                display_value = feat_value
                terms.append(f"{p_feat_given_label:.4f}")
                detailed_terms_display.append(f"P({feat_name}={display_value}|{target_label}) = {p_feat_given_label:.4f}")
            elif feat_name in self.numerical_features:
                mean = self.numerical_features_stats[target_label][feat_name]['mean']
                std = self.numerical_features_stats[target_label][feat_name]['std']
                p_feat_given_label = self._normal_pdf(feat_value, mean, std)
                score *= p_feat_given_label
                terms.append(f"{p_feat_given_label:.4f}")
                detailed_terms_display.append(f"PDF({feat_name}={feat_value}|{target_label}) = {p_feat_given_label:.4f}")
            else:
                # This should ideally not happen if features are correctly defined
                print(f"Warning: Feature '{feat_name}' not recognized during prediction. Skipping.")
        return score, terms, detailed_terms_display

    def predict_proba(self, query_features):
        """
        ทำนาย Posterior Probabilities สำหรับแต่ละคลาส
        คืนค่าเป็น dictionary {class: probability}
        """
        scores = {}
        # We don't need detailed terms for predict_proba, only for display
        for cls in self.classes:
            score, _, _ = self._calculate_unnormalized_posterior_with_terms(query_features, cls)
            scores[cls] = score

        total_score = sum(scores.values())

        posterior_probs = {}
        if total_score > 0:
            for cls in self.classes:
                posterior_probs[cls] = scores[cls] / total_score
        else:
            for cls in self.classes:
                posterior_probs[cls] = 0.0
            print("Warning: All unnormalized scores are zero. Posterior probabilities set to 0.0.")

        return posterior_probs

    def predict(self, query_features):
        """
        ทำนายคลาสที่มีความน่าจะเป็นสูงสุด
        """
        posterior_probs = self.predict_proba(query_features)

        if posterior_probs:
            return max(posterior_probs, key=posterior_probs.get)
        else:
            return None

In [101]:
# 1. กำหนด Features และ Target
categorical_feats = ['Outlook', 'Temperature', 'Humidity', 'Windy']
numerical_feats = []
target_feat = 'Play'

# 2. fit model
nb_model = NaiveBayesClassifier(categorical_feats, numerical_feats, target_feat, use_add1_smoothing=False)
nb_model.fit(df)

## Step 1 : Calculate Prior Probability of Classes P(y)

In [102]:
# Step 1 : Calculate Prior Probability of Classes P(C)
print(f"\nP(Yes) = {nb_model.prior_probs['Yes']:.4f}")
print(f"P(No) = {nb_model.prior_probs['No']:.4f}")


P(Yes) = 0.6429
P(No) = 0.3571


## Step 2 : Calculate the Likelihood Table for all features

In [103]:
# Step 2 : Calculate the Likelihood Table for all features

total_yes_samples = df[df[target_feat] == 'Yes'].shape[0]
total_no_samples = df[df[target_feat] == 'No'].shape[0]

for feature_name in categorical_feats:
    print(f"\n=== Likelihood Table for {feature_name.capitalize()} ===")
    num_unique_feature_values = df[feature_name].nunique()
    for feat_value in sorted(df[feature_name].unique()):
        display_feat_value = feat_value

        count_feat_class_yes = df[(df[target_feat] == 'Yes') & (df[feature_name] == feat_value)].shape[0]
        count_feat_class_no = df[(df[target_feat] == 'No') & (df[feature_name] == feat_value)].shape[0]

        denominator_yes = total_yes_samples + num_unique_feature_values
        denominator_no = total_no_samples + num_unique_feature_values

        if nb_model.use_add1_smoothing:
            fraction_yes_str = f"{count_feat_class_yes + 1}/{total_yes_samples + num_unique_feature_values}"
            fraction_no_str = f"{count_feat_class_no + 1}/{total_no_samples + num_unique_feature_values}"
        else: # กรณีไม่มี smoothing
            fraction_yes_str = f"{count_feat_class_yes}/{total_yes_samples}" if total_yes_samples > 0 else "0/0"
            fraction_no_str = f"{count_feat_class_no}/{total_no_samples}" if total_no_samples > 0 else "0/0"

        yes_prob = nb_model.likelihoods_categorical['Yes'][feature_name].get(feat_value, 0)
        no_prob = nb_model.likelihoods_categorical['No'][feature_name].get(feat_value, 0)

        print(f"{display_feat_value:9s} | P({display_feat_value:9s} | Yes) : {fraction_yes_str:<6s} = {yes_prob:.2f} | P({display_feat_value:9s} | No) : {fraction_no_str:<6s} = {no_prob:.2f}")


=== Likelihood Table for Outlook ===
Overcast  | P(Overcast  | Yes) : 4/9    = 0.44 | P(Overcast  | No) : 0/5    = 0.00
Rain      | P(Rain      | Yes) : 2/9    = 0.22 | P(Rain      | No) : 3/5    = 0.60
Sunny     | P(Sunny     | Yes) : 3/9    = 0.33 | P(Sunny     | No) : 2/5    = 0.40

=== Likelihood Table for Temperature ===
Cool      | P(Cool      | Yes) : 3/9    = 0.33 | P(Cool      | No) : 1/5    = 0.20
Hot       | P(Hot       | Yes) : 2/9    = 0.22 | P(Hot       | No) : 2/5    = 0.40
Mild      | P(Mild      | Yes) : 4/9    = 0.44 | P(Mild      | No) : 2/5    = 0.40

=== Likelihood Table for Humidity ===
High      | P(High      | Yes) : 3/9    = 0.33 | P(High      | No) : 4/5    = 0.80
Normal    | P(Normal    | Yes) : 6/9    = 0.67 | P(Normal    | No) : 1/5    = 0.20

=== Likelihood Table for Windy ===
Strong    | P(Strong    | Yes) : 3/9    = 0.33 | P(Strong    | No) : 3/5    = 0.60
Weak      | P(Weak      | Yes) : 6/9    = 0.67 | P(Weak      | No) : 2/5    = 0.40


### 3. Set query

In [104]:
query = {
    'Outlook': 'Rain',
    'Temperature': 'Cool',
    'Humidity': 'High',
    'Windy': 'Strong'
}

### 4. Posterior Probabilities (Unnormalized)

In [105]:
# 4. คำนวณและแสดง Posterior Probabilities (Unnormalized)
print("\n--- Posterior Probability for 'Yes' ---")
score_yes, terms_numerical_yes, terms_display_yes = nb_model._calculate_unnormalized_posterior_with_terms(query, 'Yes')

print(f"{' | '.join(terms_display_yes)}")
# Display with 4 decimal places for consistency
formatted_numerical_terms_yes = [f"{float(term):.4f}" for term in terms_numerical_yes]
print(f"P(X|Yes) * P(Yes) = ({' * '.join(formatted_numerical_terms_yes)})")
print(f"P(X|Yes) * P(Yes) = {score_yes:.4f}")


print("\n--- Posterior Probability for 'No' ---")
score_no, terms_numerical_no, terms_display_no = nb_model._calculate_unnormalized_posterior_with_terms(query, 'No')

print(f"{' | '.join(terms_display_no)}")
# Display with 4 decimal places for consistency
formatted_numerical_terms_no = [f"{float(term):.4f}" for term in terms_numerical_no]
print(f"P(X|No) * P(No) = ({' * '.join(formatted_numerical_terms_no)})")
print(f"P(X|No) * P(No) = {score_no:.4f}")


--- Posterior Probability for 'Yes' ---
P(Yes) = 0.6429 | P(Outlook=Rain|Yes) = 0.2222 | P(Temperature=Cool|Yes) = 0.3333 | P(Humidity=High|Yes) = 0.3333 | P(Windy=Strong|Yes) = 0.3333
P(X|Yes) * P(Yes) = (0.6429 * 0.2222 * 0.3333 * 0.3333 * 0.3333)
P(X|Yes) * P(Yes) = 0.0053

--- Posterior Probability for 'No' ---
P(No) = 0.3571 | P(Outlook=Rain|No) = 0.6000 | P(Temperature=Cool|No) = 0.2000 | P(Humidity=High|No) = 0.8000 | P(Windy=Strong|No) = 0.6000
P(X|No) * P(No) = (0.3571 * 0.6000 * 0.2000 * 0.8000 * 0.6000)
P(X|No) * P(No) = 0.0206


### 5. Normalized Posterior Probabilities

In [106]:
# 5. Final Decision (Normalized Posterior Probabilities)
total_unnormalized_score = score_yes + score_no

posterior_yes_normalized = (score_yes / total_unnormalized_score) * 100 if total_unnormalized_score > 0 else 0
posterior_no_normalized = (score_no / total_unnormalized_score) * 100 if total_unnormalized_score > 0 else 0

print("\n--- Final Decision ---")
print(f"P(Play=Yes | x) = {score_yes:.4f} ({posterior_yes_normalized:.2f}%)")
print(f"P(Play=No  | x) = {score_no:.4f} ({posterior_no_normalized:.2f}%)")


--- Final Decision ---
P(Play=Yes | x) = 0.0053 (20.46%)
P(Play=No  | x) = 0.0206 (79.54%)


# NaiveBayesClassifier From SKlearn

In [107]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

# --- Load data ---
target = 'Play'

# --- One-hot encoding ---
le_y = LabelEncoder()
X = pd.get_dummies(df.drop(columns=[target]))
y = le_y.fit_transform(df[target])

# --- Train MultinomialNB ---
alpha = 1e-10 # no add-1 smothing (but if alpha = 1 , use add-1 smothing)
model = MultinomialNB(alpha=alpha)
model.fit(X, y)

# --- Extract Likelihood Table ---
class_labels = le_y.inverse_transform(np.arange(len(le_y.classes_)))
features = X.columns
class_counts = np.bincount(y)
n_features = X.shape[1]

print("=== Likelihood Table ===")
for feat in features:
    feat_name, val_name = feat.split('_', 1)
    idx = X.columns.get_loc(feat)
    for ci, cls in enumerate(class_labels):
        count = model.feature_count_[ci, idx]
        total = class_counts[ci]
        # ความน่าจะเป็นแบบ smoothing
        prob_smooth = (count + alpha) / (total + alpha * n_features)
        print(f"{feat_name}={val_name} | P({val_name} | {cls}) : ({int(count)} + {alpha})/({int(total)} + {alpha}*{n_features}) = {prob_smooth:.4f}")
    print('-'*50)


=== Likelihood Table ===
Outlook=Overcast | P(Overcast | No) : (0 + 1e-10)/(5 + 1e-10*10) = 0.0000
Outlook=Overcast | P(Overcast | Yes) : (4 + 1e-10)/(9 + 1e-10*10) = 0.4444
--------------------------------------------------
Outlook=Rain | P(Rain | No) : (3 + 1e-10)/(5 + 1e-10*10) = 0.6000
Outlook=Rain | P(Rain | Yes) : (2 + 1e-10)/(9 + 1e-10*10) = 0.2222
--------------------------------------------------
Outlook=Sunny | P(Sunny | No) : (2 + 1e-10)/(5 + 1e-10*10) = 0.4000
Outlook=Sunny | P(Sunny | Yes) : (3 + 1e-10)/(9 + 1e-10*10) = 0.3333
--------------------------------------------------
Temperature=Cool | P(Cool | No) : (1 + 1e-10)/(5 + 1e-10*10) = 0.2000
Temperature=Cool | P(Cool | Yes) : (3 + 1e-10)/(9 + 1e-10*10) = 0.3333
--------------------------------------------------
Temperature=Hot | P(Hot | No) : (2 + 1e-10)/(5 + 1e-10*10) = 0.4000
Temperature=Hot | P(Hot | Yes) : (2 + 1e-10)/(9 + 1e-10*10) = 0.2222
--------------------------------------------------
Temperature=Mild | P(Mi

In [108]:
# แปลง query เป็น one-hot vector ให้ตรงกับ X.columns
def query_to_vector(query, feature_columns):
    vec_dict = {col:0 for col in feature_columns}
    for feat, val in query.items():
        col_name = f"{feat}_{val}"
        if col_name in vec_dict:
            vec_dict[col_name] = 1
        else:
            raise ValueError(f"Value '{val}' for feature '{feat}' not found in training data.")
    return pd.DataFrame([vec_dict])

X_query = query_to_vector(query, X.columns)

# --- Predict probabilities ---
probs = model.predict_proba(X_query)[0]  # จะได้ array ของความน่าจะเป็นแต่ละ class
classes = le_y.inverse_transform(np.arange(len(probs)))

print('--- Final Decision ---')
for c, p in zip(classes, probs):
    print(f"P(Play={c} | x) = {p:.4f} ({p*100:.2f}%)")

--- Final Decision ---
P(Play=No | x) = 0.7954 (79.54%)
P(Play=Yes | x) = 0.2046 (20.46%)


# Homework

1. จาก Naive Bayesian model ที่ได้สร้างไว้ (ใช้ model จากตัวอย่างด้านบน(SKlearn หรือ Scratch) ก็ได้) แล้วนั้นให้ทำการ query ข้อมูลเข้าโมเดลโดยมีเงื่อนไขการเลือก query จาก dataset ดังนี้
 - ทำการเลือก query จาก dataset ลำดับที่ m (โดยที่ m คือเลขท้ายประจำตัวของนักศึกษา => ลงท้าย 0 ; m=10)
 - ทำการเปลี่ยน feature ลำดับที่ n (โดยที่ n คือเลขท้ายประจำตัวของนักศึกษา%4 )

EX P'TA รหัส 65010100 (m = 0, n = 0) เลือก query ที่ 0 และ
feature ที่ 0 (Outlook)

```python
query(ก่อนเปลี่ยน) =
{'Outlook': 'Sunny',
'Temperature': 'Hot',
'Humidity': 'High',
'Windy': 'Week'}

feature ที่ 0 (Outlook) เปลี่ยนจาก 'Sunny' เป็นค่าอื่น

query(หลังเปลี่ยน) =
{'Outlook': 'Rain',
'Temperature': 'Hot',
'Humidity': 'High',
'Windy': 'Week'}
```
หลังจากทำการเปลี่ยน query ตามเงื่อนไขเรียบร้อยแล้วนั้นจงแสดงค่าของ Posterior Probability ของ (yes และ no) พร้อมทั้งระบุว่าจาก query ที่มีการเปลี่ยนแปลงแล้วนั้น model ทำนาย class ใดออกมา (ทั้งแบบใช้ Add-1 Smoothing, และไม่ใช้ Add-1 Smoothing)

2. ทำการ train naive bayes model โดยที่ไม่ใช้ Add-1 Smoothing
    - train_test_split(test_size = 0.3, random_state = k) ; k = เลขท้าย 3 ตัวรหัสนักศึกษา
    - ทำการแสดงค่า Likelihood Table ของแต่ละ feature
    - test_data แต่ละตัวให้ทำการแสดง Posterior Probability ของทั้ง 2 class ('No', 'Yes')
    - แสดงค่า Accuracy ที่ได้

EX P'TA รหัส 65010089 (k = 089) train_test_split(test_size = 0.3, random_state = 89)

3. ทำเหมือนกับกับข้อที่ 2 โดยที่ใช้ Add-1 Smoothing
4. หลังจากได้ทำทั้ง 2 model จากข้อ2 และ ข้อ3 (ทั้งแบบใช้ Add-1 Smoothing, และไม่ใช้ Add-1 Smoothing) แล้วนั้นหลังจากที่นักศึกษาได้รับ test_data จากการ split แล้วนั้น นักศึกษาคิดว่าควรใช้ model รูปแบบใดจึงจะเหมาะสมที่สุดเพราะเหตุใด

5. จงระบุเหตุการณ์หรือสถานการณ์ว่าข้อมูลรูปแบบใดควรใช้ Add-1 Smoothing, และไม่ใช้ Add-1 Smoothing

In [109]:
import kagglehub
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from math import sqrt, pi, exp

# Download latest version
folder_path = kagglehub.dataset_download("dheemanthbhat/simple-weather-forecast")

print("Path to dataset files:", folder_path)
print(os.listdir(folder_path))

csv_path = os.path.join(folder_path, 'weather_forecast.csv')
df = pd.read_csv(csv_path)
df

Path to dataset files: /Users/7n100489/.cache/kagglehub/datasets/dheemanthbhat/simple-weather-forecast/versions/1
['weather_forecast.csv']


Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [110]:
## ข้อที่ 1
categorical_feats = ["Outlook", "Temperature", "Humidity", "Windy"]
numerical_feats = []
target_feat = "Play"

# fit model
nb_model_Ex1 = NaiveBayesClassifier(
    categorical_feats, numerical_feats, target_feat, use_add1_smoothing=False
)
nb_model_Ex1.fit(df)

In [111]:
student_id = 65010077
m = student_id % 10
n = student_id % 4

print("M = ", m)
print("N = ", n)

M =  7
N =  1


In [112]:
categorical_feats = ['Outlook', 'Temperature', 'Humidity', 'Windy']
target_feat = 'Play'

In [113]:
query_original = df.iloc[m][categorical_feats].to_dict()
print("Query Original:", query_original)

Query Original: {'Outlook': 'Sunny', 'Temperature': 'Mild', 'Humidity': 'High', 'Windy': 'Weak'}


In [114]:
feat_to_change = categorical_feats[n]
possible_vals = df[feat_to_change].unique().tolist()
new_val = [v for v in possible_vals if v != query_original[feat_to_change]][0]
query_modified = query_original.copy()
query_modified[feat_to_change] = new_val
print("After modification:", query_modified)

After modification: {'Outlook': 'Sunny', 'Temperature': 'Hot', 'Humidity': 'High', 'Windy': 'Weak'}


In [115]:
nb_no_smooth = NaiveBayesClassifier(
    categorical_feats, numerical_feats, target_feat, use_add1_smoothing=False
)
nb_no_smooth.fit(df)
nb_add1 = NaiveBayesClassifier(
    categorical_feats, numerical_feats, target_feat, use_add1_smoothing=True
)
nb_add1.fit(df)

In [144]:
posterior_no_smooth = nb_no_smooth.predict_proba(query_modified)
posterior_smooth = nb_add1.predict_proba(query_modified)

pred_class_no_smooth = nb_no_smooth.predict(query_modified)
pred_class_smooth = nb_add1.predict(query_modified)

print("\nPosterior Probability (No Add-1):")
print(f"P(Yes) = {posterior_no_smooth.get('Yes', 0):.4f}")
print(f"P(No) = {posterior_no_smooth.get('No', 0):.4f}")
print(f"Predicted class (No Add-1): {pred_class_no_smooth}")

print("\nPosterior Probability (Add-1):")
print(f"P(Yes) = {posterior_smooth.get('Yes', 0):.4f}")
print(f"P(No) = {posterior_smooth.get('No', 0):.4f}")
print(f"Predicted class (Add-1): {pred_class_smooth}")


Posterior Probability (No Add-1):
P(Yes) = 0.0786
P(No) = 0.9214
Predicted class (No Add-1): No

Posterior Probability (Add-1):
P(Yes) = 0.1899
P(No) = 0.8101
Predicted class (Add-1): No


2. ทำการ train naive bayes model โดยที่ไม่ใช้ Add-1 Smoothing

In [145]:
random_state = k = int(str(student_id)[-3:])
print("random_state:", random_state)
train_df, test_df = train_test_split(df, test_size=0.3, random_state=random_state)

random_state: 77


In [146]:
nb_no_smooth = NaiveBayesClassifier(
    categorical_feats, numerical_feats, target_feat, use_add1_smoothing=False
)
nb_no_smooth.fit(train_df)

In [147]:
def show_likelihood_tables(
    df, categorical_feats, target_feat, use_add1_smoothing=False
):
    total_yes_samples = df[df[target_feat] == "Yes"].shape[0]
    total_no_samples = df[df[target_feat] == "No"].shape[0]

    for feature_name in categorical_feats:
        print(f"\n=== Likelihood Table for {feature_name.capitalize()} ===")
        num_unique_feature_values = df[feature_name].nunique()

        for feat_value in sorted(df[feature_name].unique()):
            count_feat_class_yes = df[
                (df[target_feat] == "Yes") & (df[feature_name] == feat_value)
            ].shape[0]
            count_feat_class_no = df[
                (df[target_feat] == "No") & (df[feature_name] == feat_value)
            ].shape[0]

            if use_add1_smoothing:
                prob_yes = (count_feat_class_yes + 1) / (
                    total_yes_samples + num_unique_feature_values
                )
                prob_no = (count_feat_class_no + 1) / (
                    total_no_samples + num_unique_feature_values
                )
                fraction_yes_str = f"{count_feat_class_yes + 1}/{total_yes_samples + num_unique_feature_values}"
                fraction_no_str = f"{count_feat_class_no + 1}/{total_no_samples + num_unique_feature_values}"
            else:
                prob_yes = (
                    count_feat_class_yes / total_yes_samples
                    if total_yes_samples > 0
                    else 0
                )
                prob_no = (
                    count_feat_class_no / total_no_samples
                    if total_no_samples > 0
                    else 0
                )
                fraction_yes_str = f"{count_feat_class_yes}/{total_yes_samples}"
                fraction_no_str = f"{count_feat_class_no}/{total_no_samples}"

            print(
                f"{feat_value:9s} | P({feat_value:9s} | Yes) : {fraction_yes_str:<6s} = {prob_yes:.2f} "
                f"| P({feat_value:9s} | No) : {fraction_no_str:<6s} = {prob_no:.2f}"
            )


In [148]:
from sklearn.metrics import accuracy_score

def evaluate_naive_bayes(nb_model, test_df, categorical_feats, target_feat):
    preds = []
    print("\n=== Posterior Probabilities on Test Data ===")
    for i, row in test_df.iterrows():
        query = row[categorical_feats].to_dict()
        posterior = nb_model.predict_proba(query)
        predicted_class = nb_model.predict(query)
        preds.append(predicted_class)
        print(
            f"Test sample {i}: "
            f"P(Yes) = {posterior.get('Yes', 0):.4f}, "
            f"P(No)  = {posterior.get('No', 0):.4f}, "
            f"Predicted = {predicted_class}"
        )
    
    accuracy = accuracy_score(test_df[target_feat].values, preds)
    print(f"\nAccuracy: {accuracy:.4f}")
    return accuracy


In [149]:
print("\n=== Likelihood Tables (No Smoothing) ===")
show_likelihood_tables(train_df, categorical_feats, target_feat, use_add1_smoothing=False)


=== Likelihood Tables (No Smoothing) ===

=== Likelihood Table for Outlook ===
Overcast  | P(Overcast  | Yes) : 2/5    = 0.40 | P(Overcast  | No) : 0/4    = 0.00
Rain      | P(Rain      | Yes) : 1/5    = 0.20 | P(Rain      | No) : 1/4    = 0.25
Sunny     | P(Sunny     | Yes) : 2/5    = 0.40 | P(Sunny     | No) : 3/4    = 0.75

=== Likelihood Table for Temperature ===
Cool      | P(Cool      | Yes) : 2/5    = 0.40 | P(Cool      | No) : 1/4    = 0.25
Hot       | P(Hot       | Yes) : 1/5    = 0.20 | P(Hot       | No) : 2/4    = 0.50
Mild      | P(Mild      | Yes) : 2/5    = 0.40 | P(Mild      | No) : 1/4    = 0.25

=== Likelihood Table for Humidity ===
High      | P(High      | Yes) : 1/5    = 0.20 | P(High      | No) : 3/4    = 0.75
Normal    | P(Normal    | Yes) : 4/5    = 0.80 | P(Normal    | No) : 1/4    = 0.25

=== Likelihood Table for Windy ===
Strong    | P(Strong    | Yes) : 2/5    = 0.40 | P(Strong    | No) : 2/4    = 0.50
Weak      | P(Weak      | Yes) : 3/5    = 0.60 | P(Weak 

In [150]:
accuracy_no_smooth = evaluate_naive_bayes(nb_no_smooth, test_df, categorical_feats, target_feat)


=== Posterior Probabilities on Test Data ===
Test sample 13: P(Yes) = 0.2545, P(No)  = 0.7455, Predicted = No
Test sample 6: P(Yes) = 1.0000, P(No)  = 0.0000, Predicted = Yes
Test sample 2: P(Yes) = 1.0000, P(No)  = 0.0000, Predicted = Yes
Test sample 9: P(Yes) = 0.8600, P(No)  = 0.1400, Predicted = Yes
Test sample 3: P(Yes) = 0.3386, P(No)  = 0.6614, Predicted = No

Accuracy: 0.8000


3. ทำเหมือนกับกับข้อที่ 2 โดยที่ใช้ Add-1 Smoothing

In [151]:
nb_add1 = NaiveBayesClassifier(
    categorical_feats, numerical_feats, target_feat, use_add1_smoothing=True
)
nb_add1.fit(train_df)

In [152]:
print("\n=== Likelihood Tables (Add-1 Smoothing) ===")
show_likelihood_tables(train_df, categorical_feats, target_feat, use_add1_smoothing=True)


=== Likelihood Tables (Add-1 Smoothing) ===

=== Likelihood Table for Outlook ===
Overcast  | P(Overcast  | Yes) : 3/8    = 0.38 | P(Overcast  | No) : 1/7    = 0.14
Rain      | P(Rain      | Yes) : 2/8    = 0.25 | P(Rain      | No) : 2/7    = 0.29
Sunny     | P(Sunny     | Yes) : 3/8    = 0.38 | P(Sunny     | No) : 4/7    = 0.57

=== Likelihood Table for Temperature ===
Cool      | P(Cool      | Yes) : 3/8    = 0.38 | P(Cool      | No) : 2/7    = 0.29
Hot       | P(Hot       | Yes) : 2/8    = 0.25 | P(Hot       | No) : 3/7    = 0.43
Mild      | P(Mild      | Yes) : 3/8    = 0.38 | P(Mild      | No) : 2/7    = 0.29

=== Likelihood Table for Humidity ===
High      | P(High      | Yes) : 2/7    = 0.29 | P(High      | No) : 4/6    = 0.67
Normal    | P(Normal    | Yes) : 5/7    = 0.71 | P(Normal    | No) : 2/6    = 0.33

=== Likelihood Table for Windy ===
Strong    | P(Strong    | Yes) : 3/7    = 0.43 | P(Strong    | No) : 3/6    = 0.50
Weak      | P(Weak      | Yes) : 4/7    = 0.57 | P(We

In [154]:
evaluate_naive_bayes(nb_add1, test_df, categorical_feats, target_feat)


=== Posterior Probabilities on Test Data ===
Test sample 13: P(Yes) = 0.3453, P(No)  = 0.6547, Predicted = No
Test sample 6: P(Yes) = 0.8878, P(No)  = 0.1122, Predicted = Yes
Test sample 2: P(Yes) = 0.4839, P(No)  = 0.5161, Predicted = No
Test sample 9: P(Yes) = 0.7785, P(No)  = 0.2215, Predicted = Yes
Test sample 3: P(Yes) = 0.4128, P(No)  = 0.5872, Predicted = No

Accuracy: 0.6000


0.6

4. หลังจากได้ทำทั้ง 2 model จากข้อ2 และ ข้อ3 (ทั้งแบบใช้ Add-1 Smoothing, และไม่ใช้ Add-1 Smoothing) แล้วนั้นหลังจากที่นักศึกษาได้รับ test_data จากการ split แล้วนั้น นักศึกษาคิดว่าควรใช้ model รูปแบบใดจึงจะเหมาะสมที่สุดเพราะเหตุใด

ตอบ ควรใช้ แบบไม่ใช้ Add-1 Smoothing เพราะมี Acc ที่สูงกว่าและจะพบว่าใน Likelihood table ของเราจะมีกรณี P(Overcast | No) : 0/4 = 0.00 แต่ที่ไม่ใช้เป็นแบบ Add-1 เพราะ Dataset ของเราไม่มี data ไหนที่จะมาเข้ากรณีของ P(Overcast | No)

5. จงระบุเหตุการณ์หรือสถานการณ์ว่าข้อมูลรูปแบบใดควรใช้ Add-1 Smoothing, และไม่ใช้ Add-1 Smoothing

ตอบ ใช้ Add-1 Smoothing ในกรณีที่ใน Likelihood table มีค่าเป็น 0 ทำให้เมื่อนำไปใช้คำนวณความน่าจะเป็นของ class label จะทำให้ผลลัพธ์เป็น 0 ส่วนในกรณีที่ Likelihood table ไม่มีค่าเป็น 0 ก็ไม่จำเป็นต้องใช้ Add-1 Smoothing