In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv('spam.csv', encoding='latin1')

In [3]:
dataset = dataset.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [4]:
dataset.shape

(5572, 2)

In [5]:
dataset.duplicated().sum()

403

In [6]:
dataset.drop_duplicates(inplace=True)

In [7]:
dataset.columns=['label','message']

In [8]:
#from sklearn.preprocessing import LabelEncoder
#encoder = LabelEncoder()
#df['label'] = encoder.fit_transform(df['label'])

dataset['label_num'] = dataset['label'].map({'ham': 0,'spam':1})

In [9]:
dataset = dataset[['message','label_num']]

In [10]:
dataset.head()

Unnamed: 0,message,label_num
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [11]:
dataset.isnull().sum()

message      0
label_num    0
dtype: int64

In [12]:
from sklearn.model_selection import train_test_split
X = dataset['message']
y = dataset['label_num']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
dataset['label_num'].value_counts()

0    4516
1     653
Name: label_num, dtype: int64

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [15]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Train the model
model = MultinomialNB()
model.fit(X_train_vec, y_train)

# Predict
y_pred = model.predict(X_test_vec)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9835589941972921


In [16]:
import nltk
nltk.download('punkt')
nltk.data.path.append("C:/Users/hp340/AppData/Roaming/nltk_data")
dataset['num_characters'] = dataset['message'].apply(len)
dataset['num_word'] = dataset['message'].apply(lambda x:len(nltk.word_tokenize(x)))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp340\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


3. Data Preprocessing
Lower case

Tokenization

Removing special characters

Removing stop words and punctuation

Stemming

In [17]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
import string
ps = PorterStemmer()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp340\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
def transform_text(text):
    text = text.lower()
    text= nltk.word_tokenize(text)
    
    y =[]
    for i in text:
        if i.isalnum():
            y.append(i)
            
    text = y[:]
    y.clear()
    
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
        
    text = []
    for i in y:
        text.append(ps.stem(i))
    return " ".join(text)


In [19]:
transform_text('hello how did you get to the dancing room?')

'hello get danc room'

In [20]:
dataset['message']= dataset['message'].apply(transform_text)

In [21]:
dataset.head()

Unnamed: 0,message,label_num,num_characters,num_word
0,go jurong point crazi avail bugi n great world...,0,111,24
1,ok lar joke wif u oni,0,29,8
2,free entri 2 wkli comp win fa cup final tkt 21...,1,155,37
3,u dun say earli hor u c alreadi say,0,49,13
4,nah think goe usf live around though,0,61,15


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(dataset['message']).toarray()
print(tfidf.get_feature_names_out())
print(X)


['008704050406' '0089' '0121' ... 'åôrent' 'ìä' 'ìï']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [23]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [24]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [25]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()


In [26]:
gnb.fit(X_train,y_train)
y_pred1 = gnb.predict(X_test)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
print(precision_score(y_test,y_pred1))

0.8762088974854932
[[793 103]
 [ 25 113]]
0.5231481481481481


In [27]:
mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))

0.9593810444874274
[[896   0]
 [ 42  96]]
1.0


In [28]:
bnb.fit(X_train,y_train)
y_pred3 = bnb.predict(X_test)
print(accuracy_score(y_test,y_pred3))
print(confusion_matrix(y_test,y_pred3))
print(precision_score(y_test,y_pred3))

0.9700193423597679
[[893   3]
 [ 28 110]]
0.9734513274336283


In [29]:
import pickle
pickle.dump(tfidf,open('vectorizer.pkl','wb'))
pickle.dump(mnb,open('model.pkl','wb'))