In [3]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv('../data/spam.csv')

In [5]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [6]:
df['spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

In [7]:
df


Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [8]:
x_train, x_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size = 0.2)

In [9]:

# -------------------------------
# 1. Hàm tiền xử lý văn bản
# -------------------------------
def tokenize(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    tokens = text.split()
    return tokens

# -------------------------------
# 2. Huấn luyện Naive Bayes
# -------------------------------
class NaiveBayesTest:
    def __init__(self):
        self.vocab = set()
        self.word_freq_spam = Counter()
        self.word_freq_ham = Counter()
        self.total_spam_words = 0
        self.total_ham_words = 0
        self.p_spam = 0
        self.p_ham = 0

    def fit(self, X_train, y_train):
        spam_messages = []
        ham_messages = []

        for msg, label in zip(X_train, y_train):
            tokens = tokenize(msg)

            # gom từ vào vocab
            self.vocab.update(tokens)

            if label == 1:  # spam
                spam_messages.extend(tokens)
            else:
                ham_messages.extend(tokens)

        # Đếm số từ
        self.word_freq_spam = Counter(spam_messages)
        self.word_freq_ham = Counter(ham_messages)

        self.total_spam_words = sum(self.word_freq_spam.values())
        self.total_ham_words = sum(self.word_freq_ham.values())

        # Xác suất tiên nghiệm
        self.p_spam = sum(y_train) / len(y_train)
        self.p_ham = 1 - self.p_spam

    # -------------------------------
    # Tính log(P(word | class))
    # -------------------------------
    def word_likelihood(self, word, class_type):
        vocab_size = len(self.vocab)
        if class_type == "spam":
            return np.log((self.word_freq_spam[word] + 1) / (self.total_spam_words + vocab_size))
        else:
            return np.log((self.word_freq_ham[word] + 1) / (self.total_ham_words + vocab_size))

    # -------------------------------
    # 3. Dự đoán
    # -------------------------------
    def predict(self, message):
        tokens = tokenize(message)
        
        # log probabilities tránh underflow
        log_spam = np.log(self.p_spam)
        log_ham = np.log(self.p_ham)

        for word in tokens:
            log_spam += self.word_likelihood(word, "spam")
            log_ham += self.word_likelihood(word, "ham")

        return 1 if log_spam > log_ham else 0

    def predict_batch(self, X):
        return [self.predict(m) for m in X]

In [10]:
# -------------------------------
# Huấn luyện mô hình
# -------------------------------
nb = NaiveBayesTest()
nb.fit(x_train, y_train)

In [11]:
def check_message(msg):
    pred = nb.predict(msg)
    if pred == 1:
        return "spam"
    else:
        return "ham"

# Ví dụ kiểm tra
check_message("can we meet tomorrow?")


'ham'

In [12]:
# -------------------------------
# Đánh giá mô hình
# -------------------------------
preds = nb.predict_batch(x_test)
accuracy = np.mean(np.array(preds) == np.array(y_test))
accuracy


np.float64(0.9874439461883409)