In [11]:
# Load the SMS spam dataset into a two-column DataFrame
import pandas as pd

# NOTE(review): absolute, machine-specific path -- the notebook only runs
# where this exact file exists. Consider a path relative to a data directory.
DATASET_PATH = '/home/m/Desktop/NLP-Natural-Language-Processing/Spam-Classifier/SpamCollectionDataset/SMSSpamCollection'

# No header row is expected in the file, so the column names are supplied here.
COLUMN_NAMES = ["label", "message"]

# Tab-separated file: one label and one message per line.
# NOTE(review): the default CSV quoting treats '"' as a quote character; if the
# row count looks short compared with the raw file, check the `quoting` option.
messages = pd.read_csv(DATASET_PATH, sep='\t', names=COLUMN_NAMES)

In [12]:
# Data cleaning and preprocessing
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Build the stop-word lookup once. Previously stopwords.words('english') was
# called for every token of every message and the resulting list was searched
# linearly; a set gives the same membership answers with constant-time lookups.
# (Requires the NLTK 'stopwords' corpus to be available locally.)
english_stopwords = set(stopwords.words('english'))

# Every character that is not an ASCII letter is replaced by a space.
non_letter_pattern = re.compile('[^a-zA-Z]')


def _clean_message(text):
    """Return `text` normalised for the bag-of-words model.

    Non-letter characters become spaces, the text is lower-cased and split on
    whitespace, English stop words are dropped, and the remaining tokens are
    Porter-stemmed and joined back together with single spaces.

    Parameters
    ----------
    text : str
        One raw SMS message.

    Returns
    -------
    str
        The cleaned, stemmed message (may be empty if no tokens survive).
    """
    tokens = non_letter_pattern.sub(' ', text).lower().split()
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    return ' '.join(stems)


# Iterate over the column values directly rather than messages['message'][i]:
# the label-based lookup only works while the frame has a 0..n-1 integer index.
corpus = [_clean_message(text) for text in messages['message']]

In [13]:
# Bag-of-words features and binary target
from sklearn.feature_extraction.text import CountVectorizer

# Cap the vocabulary at 2500 terms.
cv = CountVectorizer(max_features=2500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()   # dense document-term count matrix

# One-hot encode the labels; a single indicator column is enough for a
# two-class problem, so only the second column is kept.
# NOTE(review): presumably column 1 is the "spam" indicator -- confirm against
# the label values actually present in the data.
label_indicators = pd.get_dummies(messages['label'])
y = label_indicators.iloc[:, 1].values

In [14]:
# Hold out a test set, fit a Naive Bayes classifier, and predict on the test set
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# 80/20 split; the fixed random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# A Naive Bayes classifier is a probabilistic machine learning model used for
# classification tasks. It is based on Bayes' Theorem, which describes the
# probability of an event based on prior knowledge of conditions related to
# the event. The "naive" part of the name comes from the assumption that all
# features are independent of each other, which is rarely true in real-world
# data but simplifies the computation.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

# Predicted labels for the held-out messages.
y_pred = spam_detect_model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out messages whose predicted label matches the true label,
# reported as a percentage with two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_percentage = 100 * accuracy
print(f'Accuracy: {accuracy_percentage:.2f}%')