In [32]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [17]:
# Load the SMS Spam Collection: a tab-separated file with no header row.
# Dataset link: https://archive.ics.uci.edu/ml/datasets/sms+spam+collection
# The text column is named 'messege' (sic); later cells index it by that name.
messages = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['label', 'messege'],
)

In [18]:
# Preview the first five rows to sanity-check the parsed columns.
messages.head(n=5)

Unnamed: 0,label,messege
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
#data preprocessing (experiment 1: Porter stemming)
stemmer = PorterStemmer()

# Build the stop-word set once. Calling stopwords.words('english') and
# wrapping it in set() inside the comprehension repeated that work for
# every single token of every message.
stop_words = set(stopwords.words('english'))

corpus = []

# Iterate over the column values directly so the loop does not rely on the
# frame carrying a default 0..n-1 index.
for message in messages['messege']:
    # Replace every character that is not an ASCII letter with a space,
    # then lower-case so stop-word matching is case-insensitive.
    review = re.sub('[^a-zA-Z]', ' ', message).lower()
    words = nltk.word_tokenize(review)
    # Drop English stop words and reduce each remaining token to its stem.
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    review = ' '.join(words)

    corpus.append(review)

In [12]:
# Peek at the first five cleaned messages.
corpus[0:5]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though']

In [20]:
# Bag-of-words model: raw token counts, vocabulary capped at 2500 features.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split below, so the vocabulary is learned from test messages too.
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(raw_documents=corpus).toarray()

In [21]:
# Binary target: 1 = spam, 0 = ham (not spam).
# Derive it by comparing the label text directly. Taking column 1 of
# pd.get_dummies() also selects 'spam' (columns sort as ham, spam), but that
# relies on column ordering and on the dummy dtype, and made it easy to
# document the classes the wrong way round.
y = (messages['label'] == 'spam').astype(int).values

In [25]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.20,
    random_state=101,
)

In [27]:
# Fit a multinomial Naive Bayes classifier on the training split.
spam_detector = MultinomialNB()
spam_detector = spam_detector.fit(X_train, y_train)

In [28]:
# Predict labels for the held-out messages.
y_pred = spam_detector.predict(X=X_test)

In [35]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[969  11]
 [  8 127]]


In [38]:
# Fraction of held-out messages classified correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.9829596412556054

In [None]:
# Next experiment:
# - lemmatization instead of stemming
# - TF-IDF instead of bag of words

In [39]:
#data preprocessing (experiment 2: WordNet lemmatization)
lem = WordNetLemmatizer()

# Build the stop-word set once. Calling stopwords.words('english') and
# wrapping it in set() inside the comprehension repeated that work for
# every single token of every message.
stop_words = set(stopwords.words('english'))

corpus = []

# Iterate over the column values directly so the loop does not rely on the
# frame carrying a default 0..n-1 index.
for message in messages['messege']:
    # Replace every character that is not an ASCII letter with a space,
    # then lower-case so stop-word matching is case-insensitive.
    review = re.sub('[^a-zA-Z]', ' ', message).lower()
    words = nltk.word_tokenize(review)
    # Drop English stop words and lemmatize each remaining token.
    words = [lem.lemmatize(word) for word in words if word not in stop_words]
    review = ' '.join(words)

    corpus.append(review)

In [40]:
# Peek at the first five cleaned messages.
corpus[0:5]

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though']

In [41]:
# TF-IDF model: TF-IDF weighted term features, vocabulary capped at 2500.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split below, so its statistics also reflect test messages.
cv = TfidfVectorizer(max_features=2500)
X = cv.fit_transform(raw_documents=corpus).toarray()

In [42]:
# Binary target: 1 = spam, 0 = ham (not spam).
# Derive it by comparing the label text directly. Taking column 1 of
# pd.get_dummies() also selects 'spam' (columns sort as ham, spam), but that
# relies on column ordering and on the dummy dtype, and made it easy to
# document the classes the wrong way round.
y = (messages['label'] == 'spam').astype(int).values

In [43]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.20,
    random_state=101,
)

In [44]:
# Fit a multinomial Naive Bayes classifier on the training split.
spam_detector = MultinomialNB()
spam_detector = spam_detector.fit(X_train, y_train)

In [45]:
# Predict labels for the held-out messages.
y_pred = spam_detector.predict(X=X_test)

In [46]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[977   3]
 [ 22 113]]


In [47]:
# Fraction of held-out messages classified correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.9775784753363229

In [None]:
# Based on Krish Naik's tutorials. Thanks to him!