In [54]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import re


df = pd.read_csv(r"D:\ML_and_DL\Machine_Learning\spam.csv")
df['spam']=df['Category'].apply(lambda x: 1 if x=='spam' else 0)
df


Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [55]:

X = df['Message'].values
y = df['spam'].values
print(X,'\n\n\n\n',y)

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 ... 'Pity, * was in mood for that. So...any other suggestions?'
 "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free"
 'Rofl. Its true to its name'] 



 [0 0 1 ... 0 0 0]


In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
print(X_train.shape, X_test.shape)

(4457,) (1115,)


### TEXT PREPROCESSING

In [57]:
def tokenize(text):
    return re.findall(r'\b\w+\b', text.lower())

In [58]:
print(tokenize("Hello, World"))

['hello', 'world']


## BAG OF WORDS

In [59]:
def build_vocab(texts):
    vocab = set()
    for text in texts:
        vocab.update(tokenize(text))
    vocab = sorted(list(vocab))
    word2idx = {word: i for i, word in enumerate(vocab)}
    return vocab, word2idx
vocab, word2idx = build_vocab(X_train)
V = len(vocab)
print(V)

7728


In [60]:
print(word2idx)
print(vocab)



In [61]:
def message_to_vector(text):
    vec = np.zeros(V)

    for word in tokenize(text):
        # print(word)
        if word in word2idx:
            # print(word2idx[word])
            vec[word2idx[word]] += 1
            # print(vec[word2idx[word]] )
    return vec

In [62]:
text=message_to_vector("Hello, Worlds")
text[7577]

np.float64(1.0)

# NAIVE BAYES

In [63]:
class NaiveBayesBOW:
    def __init__(self):
        self.class_priors = {}
        self.word_likelihoods = {}
        self.class_word_counts = {}
        self.vocab_size = V

    def fit(self, X, y):
        classes = np.unique(y)
        n_samples = len(y)

        # Priors
        self.class_priors = {c: np.sum(y == c) / n_samples for c in classes}

        # Initialize counts
        self.class_word_counts = {c: np.ones(self.vocab_size) for c in classes}  # Laplace smoothing
        total_words_per_class = {c: self.vocab_size for c in classes}  # start with vocab_size for smoothing

        # Count words
        for text, label in zip(X, y):
            vec = message_to_vector(text)
            self.class_word_counts[label] += vec
            total_words_per_class[label] += np.sum(vec)

        # Compute likelihoods
        self.word_likelihoods = {
            c: self.class_word_counts[c] / total_words_per_class[c]
            for c in classes
        }

    def predict(self, X):
        predictions = []
        for text in X:
            vec = message_to_vector(text)
            scores = {}
            for c in self.class_priors:
                # Start with log prior
                log_prob = np.log(self.class_priors[c])
                # Add log likelihoods
                log_prob += np.sum(vec * np.log(self.word_likelihoods[c]))
                scores[c] = log_prob
            predictions.append(max(scores, key=scores.get))
        return np.array(predictions)

In [64]:
def accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)

In [65]:
nb = NaiveBayesBOW()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

print(f"NaiveBayes (BOW) spam/ham classification accuracy: {accuracy(y_test, predictions):.2f}")


NaiveBayes (BOW) spam/ham classification accuracy: 0.99


In [68]:
nb.predict(['Hi, How are you'])

array([0])

In [67]:

test_messages = [
    "Congratulations! You've won a free iPhone. Click here to claim now!",  # spam example
    "Hey, are we still meeting for lunch today?",  # ham example
    "Urgent! Your bank account needs verification. Click this link now!",  # spam example
]

custom_preds = nb.predict(test_messages)
print(custom_preds)


[1 0 1]
