In [1]:
import pandas as pd 

In [2]:
# Read the tab-separated SMS file; column names are supplied explicitly.
# NOTE(review): default quoting treats `"` as a quote character — confirm the
# resulting row count matches the dataset's documented size.
messages = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    names=["label", "message"],
)
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Cleaning the data

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Stemmer instance used by the cleaning loop to reduce each token to its stem.
ps = PorterStemmer()

In [4]:
# Build the stop-word set once. Calling stopwords.words("english") inside the
# comprehension repeats the call for every token and then scans a list; a set
# gives the same membership answers with a constant-time lookup.
english_stopwords = set(stopwords.words("english"))

# Cleaned corpus: one space-joined string of stemmed tokens per message.
corpus = []
for message in messages["message"]:
    # Replace every non-letter with a space, lowercase, and split on whitespace.
    tokens = re.sub("[^a-zA-Z]", " ", message).lower().split()
    # Drop stop words, then reduce each remaining token to its Porter stem.
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(" ".join(stems))

### Building the Bag-of-Words Model

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the cleaned corpus, capped at 2500 features, then
# converted from a sparse matrix to a dense array.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so test messages contribute to the vocabulary.
cv = CountVectorizer(max_features = 2500)
X = cv.fit_transform(corpus).toarray()

In [6]:
# Show the dense feature matrix together with its (rows, columns) shape.
print(X, X.shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] (5572, 2500)


In [25]:
# Target vector: 1 where the label is "spam", 0 otherwise.
# Comparing against the label directly avoids depending on the column order of
# pd.get_dummies, and keeps the dtype uint8 regardless of pandas version
# (get_dummies produces bool columns by default in pandas >= 2.0).
y = (messages["label"] == "spam").astype("uint8").values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

### Splitting data into test train sets 

In [26]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)


### Training Model using Naive Bayes Model

In [27]:
from sklearn.naive_bayes import MultinomialNB

In [28]:
# Fit a multinomial Naive Bayes classifier on the training split.
spam_detection_model = MultinomialNB().fit(X_train, y_train)

In [29]:
# Predict labels for the held-out test rows.
y_pred = spam_detection_model.predict(X_test)

In [30]:
# Display the predicted labels for the test split.
y_pred

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

### Self-made accuracy (manual count of correct predictions)

In [37]:
# Count the test rows whose predicted label equals the true label.
counter = sum(1 for actual, predicted in zip(y_test, y_pred) if actual == predicted)

# Number of correct predictions, then the fraction of the test set they represent.
print(counter)
print(counter/len(y_test))

1099
0.9856502242152466


### Confusion Matrix Metric

In [32]:
from sklearn.metrics import confusion_matrix

In [33]:
# Confusion matrix with the true labels passed first and the predictions second.
confusion_m = confusion_matrix(y_test, y_pred)
confusion_m

array([[946,   9],
       [  7, 153]], dtype=int64)

### Accuracy Score

In [34]:
from sklearn.metrics import accuracy_score

In [36]:
# accuracy_score is declared as (y_true, y_pred); pass the ground truth first,
# in the same order as the confusion_matrix call. Accuracy is symmetric in its
# two arguments, so the reported value is unchanged.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9856502242152466

## Testing with a lemmatizer and TF-IDF

In [38]:
# Lemmatizer instance used in place of the stemmer for the second experiment.
wl = WordNetLemmatizer()

In [39]:
# Build the stop-word set once. Calling stopwords.words("english") inside the
# comprehension repeats the call for every token and then scans a list; a set
# gives the same membership answers with a constant-time lookup.
english_stopwords = set(stopwords.words("english"))

# Rebuild the corpus with lemmatization instead of stemming. This rebinds
# `corpus`, replacing the stemmed version.
corpus = []
for message in messages["message"]:
    # Replace every non-letter with a space, lowercase, and split on whitespace.
    tokens = re.sub("[^a-zA-Z]", " ", message).lower().split()
    # Drop stop words, then lemmatize each remaining token.
    lemmas = [wl.lemmatize(token) for token in tokens if token not in english_stopwords]
    corpus.append(" ".join(lemmas))

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the current corpus, capped at 2500 features and densified.
# Note that `cv` and `X` are rebound here, replacing the bag-of-words objects.
cv = TfidfVectorizer(max_features = 2500)
X = cv.fit_transform(corpus).toarray()

In [46]:
# Same split settings as before (20% test, seed 0), applied to the current X.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Train a fresh multinomial Naive Bayes model and predict on the held-out rows.
new_spam_model = MultinomialNB()
new_spam_model.fit(X_train, y_train)
y_pred2 = new_spam_model.predict(X_test)

In [47]:
# accuracy_score is declared as (y_true, y_pred); pass the ground truth first.
# Accuracy is symmetric in its two arguments, so the reported value is unchanged.
accuracy2 = accuracy_score(y_test, y_pred2)
accuracy2

0.979372197309417

Accuracy decreases (0.979 with lemmatization + TF-IDF vs. 0.986 with stemming + bag-of-words).

Let us try bag-of-words (BOW) with lemmatization instead.

In [48]:
# Bag-of-words counts over the current (lemmatized) corpus, capped at 2500 features.
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

# Same split settings as the other experiments: 20% test, seed 0.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Train a fresh multinomial Naive Bayes model and predict on the held-out rows.
new_spam_model = MultinomialNB()
new_spam_model.fit(X_train, y_train)
y_pred2 = new_spam_model.predict(X_test)

# accuracy_score is declared as (y_true, y_pred); pass the ground truth first.
# Accuracy is symmetric in its two arguments, so the reported value is unchanged.
accuracy2 = accuracy_score(y_test, y_pred2)
accuracy2

0.9829596412556054

Still lower than with stemming (0.983 vs. 0.986)!