# Problem Statement

Spam filtering using NaivE Bayes classifiers in order to predict whether a new mail based on its content, can be categorized as spam or not-spam.

Data processing using panda library

In [8]:
# Import the required Libraries 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
import string
import matplotlib.pyplot as plt

In [14]:
# laod the dataset

data = pd.read_csv("spam.tsv", sep='\t', names=['Class', 'Message'], encoding='ISO-8859-1')
data.head(2)


Unnamed: 0,Class,Message
0,"v1,v2,,,",
1,"ham,""Go until jurong point, crazy.. Available ...",


In [15]:
# Summary of the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5575 entries, 0 to 5574
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Class    5575 non-null   object 
 1   Message  0 non-null      float64
dtypes: float64(1), object(1)
memory usage: 87.2+ KB


In [22]:
# Create a column to keep the count of the characters present in each record
data['Length'] = data['Message'].apply(len)


In [17]:
data.head(5)

Unnamed: 0,Class,Message
0,"v1,v2,,,",
1,"ham,""Go until jurong point, crazy.. Available ...",
2,"ham,Ok lar... Joking wif u oni...,,,",
3,"spam,Free entry in 2 a wkly comp to win FA Cup...",
4,"ham,U dun say so early hor... U c already then...",


In [18]:
data.describe()

Unnamed: 0,Message
count,0.0
mean,
std,
min,
25%,
50%,
75%,
max,


In [19]:
data['Class'].value_counts()

Class
ham,"Sorry, I'll call later",,,                                                                                                                                        30
ham,I cant pick the phone right now. Pls send a message,,,                                                                                                             12
ham,Ok...,,,                                                                                                                                                           10
ham,Ok,,,                                                                                                                                                               4
ham,Your opinion about me? 1. Over 2. Jada 3. Kusruthi 4. Lovable 5. Silent 6. Spl character 7. Not matured 8. Stylish 9. Simple Pls reply..,,,                         4
                                                                                                                                                

#  Text Pre-Processing

In [23]:
# lets assign ham as 1
data.loc[data['Class']=="ham","Class"] = 1

In [24]:
# lets assign spam as 0
data.loc[data['Class']=="spam","Class"] = 0

In [25]:
data.head(8)

Unnamed: 0,Class,Message,Length
0,"v1,v2,,,",,3
1,"ham,""Go until jurong point, crazy.. Available ...",,3
2,"ham,Ok lar... Joking wif u oni...,,,",,3
3,"spam,Free entry in 2 a wkly comp to win FA Cup...",,3
4,"ham,U dun say so early hor... U c already then...",,3
5,"ham,""Nah I don't think he goes to usf, he live...",,3
6,"spam,""FreeMsg Hey there darling it's been 3 we...",,3
7,"ham,Even my brother is not like to speak with ...",,3


# First let's remove punctuation. We can just take advantage of Python's built-in string library to get a quick list of all possible punctuation:

In [26]:
# why is it important to remove punctuation?

"This message is spam" == "This message is spam."

False

In [27]:
# get the default list of punctuations in python
import string

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [29]:
# creating a function toremove the punctuation
 
def remove_punct(text):
     text = "".join([char for char in text if char not in string.punctuation])
     return text

In [30]:
s = "data// scinece!|"
remove_punct(s)

'data scinece'

In [31]:
text = []
for i in data['Message']:
    t = remove_punct(i)
    text.append(t)
    

In [32]:
data['Text_clean'] = text
data

Unnamed: 0,Class,Message,Length,Text_clean
0,"v1,v2,,,",,3,
1,"ham,""Go until jurong point, crazy.. Available ...",,3,
2,"ham,Ok lar... Joking wif u oni...,,,",,3,
3,"spam,Free entry in 2 a wkly comp to win FA Cup...",,3,
4,"ham,U dun say so early hor... U c already then...",,3,
...,...,...,...,...
5570,"spam,""This is the 2nd time we have tried 2 con...",,3,
5571,"ham,Will Ì_ b going to esplanade fr home?,,,",,3,
5572,"ham,""Pity, * was in mood for that. So...any ot...",,3,
5573,"ham,The guy did some bitching but I acted like...",,3,


In [35]:
# creating new column text_clean to hold the cleaned text

data['text_clean'] = data['Message'].apply(lambda x: remove_punct(x))

# the lambda keyword is used to create ananymous functions.

# view the dataset
data.head()

Unnamed: 0,Class,Message,Length,Text_clean,text_clean
0,"v1,v2,,,",,3,,
1,"ham,""Go until jurong point, crazy.. Available ...",,3,,
2,"ham,Ok lar... Joking wif u oni...,,,",,3,,
3,"spam,Free entry in 2 a wkly comp to win FA Cup...",,3,,
4,"ham,U dun say so early hor... U c already then...",,3,,


Now we need to convert each of those messages info a vector ( The way the ML models can understand and can work with).

In [36]:
# Splitting x and y

x = data['text_clean'].values
y = data['Class'].values

x

array(['nan', 'nan', 'nan', ..., 'nan', 'nan', 'nan'], dtype=object)

In [37]:
y

array(['v1,v2,,,',
       'ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",,,',
       'ham,Ok lar... Joking wif u oni...,,,', ...,
       'ham,"Pity, * was in mood for that. So...any other suggestions?",,,',
       "ham,The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free,,,",
       'ham,Rofl. Its true to its name,,,'], dtype=object)

In [49]:
# Datatype for y is object. Lets convert it into int
# Fill NaN values with 0
y_numeric = y_numeric.fillna(0)

# Convert the cleaned data to integers
y = y_numeric.astype('int')
y

0    1
1    2
3    4
4    5
dtype: int32

# Splitting Train and Test Data

In [42]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=10)
x_train.shape

(4460,)

In [43]:
x_test.shape

(1115,)

Tokenization means breaking down a sentence or paragraph or any text into words.

Count Vectorizer tokenization the text along with performing vry basic preprocessing like removing the punctuation marks, converting all the words to lowercase, etc. The vocuablary of known words is formed which is also used for encoding unseen text later. It will convert a collection of text documents to a matrix of taken counts.

# Bag of Words

We cannot pass text directly to train our models in Natural Language Processing, thus we need to convert it into numbers, which machine can understand and can perform the required modelling on it.

In [51]:
# CountVectorizer (Bag of words) to extract the features from rext

In [53]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Initializing the object for CountVectorizer
CV = CountVectorizer(stop_words="english")

# Initializing the object for TfidfVectorizer
TFIDF = TfidfVectorizer(stop_words="english")


[Stopwords are the words in any language which does not add much meaning to a sentence. They are the words which are very common in text documents such as a, an, the, you, your, etc. The Stop words highly appear in text documents. However, they are not being helpful for text analysis in many of the cases.
So it is better to remove from the text. We can focus on the important words if stop words are removed.]

In [54]:
# Apply countvectorizer functionality on the data to convert
# the categorical data into vectors
x_train_CV = CV.fit_transform(x_train)

In [62]:
import warnings
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

warnings.filterwarnings('ignore')

# Getting feature names
feature_names = CV.get_feature_names_out()

# Display the feature names
print(feature_names)


['document' 'sample']


# Training a Model



In [63]:
# Initializing the model
NB = MultinomialNB()

In [65]:
# feed data to the model
# NB.fit(xset_train_CV,y_train)
NB.fit(x_train_CV,y_train)

In [66]:
# Let's apply CV on our test data.
x_test_CV = CV.transform(x_test)

In [70]:
# Prediction for xSet_test_CV

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Example data
data = ["This is a sample document.", "This document is another sample document.", "Sample text data for training.", "Another example of text data."]
labels = [0, 1, 0, 1]  # Example labels

# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Initialize the CountVectorizer
CV = CountVectorizer(stop_words="english")

# Fit the CountVectorizer on the training data
x_train_CV = CV.fit_transform(x_train)

# Transform the test data using the same CountVectorizer
x_test_CV = CV.transform(x_test)

# Initialize and train the Naive Bayes classifier
NB = MultinomialNB()
NB.fit(x_train_CV, y_train)

# Predict the labels for the test data
y_predict = NB.predict(x_test_CV)

# Display the predictions
print(y_predict)



[0]


In [73]:
# Classification report 
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



In [74]:
# confusion matrix
pd.crosstab(y_test,y_predict)

col_0,0
row_0,Unnamed: 1_level_1
1,1
