In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x == "spam" else 0)
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.25)
# An 75:25 train-test split is used to ensure the model has enough data for training while reserving data for evaluation.

In [7]:
X_train.shape

(4179,)

In [8]:
X_test.shape

(1393,)

In [9]:
type(X_train)

In [33]:
X_train[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [29]:
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
X_train_count = X_train_count.toarray()

In [35]:
X_train_count[0]

array([0, 0, 0, ..., 0, 0, 0])

In [30]:
type(X_train_count)

numpy.ndarray

In [39]:
v.get_feature_names_out()[872]

'aha'

In [31]:
np.where(X_train_count[0] != 0)

(array([ 872, 3221, 4158, 4924, 7496]),)

In [40]:
X_train[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [42]:
# Model Training 
model = MultinomialNB()
model.fit(X_train_count, y_train)

In [43]:
X_test_count = v.transform(X_test)

In [46]:
# Model Predictions 
y_pred = model.predict(X_test_count)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1215
           1       0.96      0.93      0.95       178

    accuracy                           0.99      1393
   macro avg       0.97      0.96      0.97      1393
weighted avg       0.99      0.99      0.99      1393



In [47]:
# Create a Pipeline to make prediction 

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1215
           1       0.96      0.93      0.95       178

    accuracy                           0.99      1393
   macro avg       0.97      0.96      0.97      1393
weighted avg       0.99      0.99      0.99      1393



In [None]:
# Summary: converting text data into numerical features based on word frequency 
# and training a Naive Bayes classifier. The performance of the classifier is then evaluated and reported. 
# Created a pipeline to simplify the workflow from preprocessing to evaluation. 
# This method efficiently transforms text data into a format suitable for classification and assesses the model's effectiveness.