In [21]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

In [22]:
df = pd.read_csv('spam.csv', encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [23]:
# Replace ham with 0 and spam with 1
df = df.replace(['ham','spam'],[0, 1]) 
df = df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)

In [24]:
#count number of spam and ham
df['v1'].value_counts()

0    4825
1     747
Name: v1, dtype: int64

In [25]:
#number of characters in each message
df['length'] = df['v2'].apply(len)
df.head()

Unnamed: 0,v1,v2,length
0,0,"Go until jurong point, crazy.. Available only ...",111
1,0,Ok lar... Joking wif u oni...,29
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,0,U dun say so early hor... U c already then say...,49
4,0,"Nah I don't think he goes to usf, he lives aro...",61


In [27]:
corpus = []
ps = PorterStemmer()


In [28]:
# Cleaning the texts
for i in range(0, 5572):
    review = re.sub('[^a-zA-Z]', ' ', df['v2'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)
    
# Creating the Bag of Words model
cv = CountVectorizer(max_features = 2500)
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:, 0].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# label encoding
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

# Fitting Naive Bayes to the Training set
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)
    

In [29]:
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 85.65%


In [30]:
# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[811 138]
 [ 22 144]]


In [31]:
# Classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.97      0.85      0.91       949
           1       0.51      0.87      0.64       166

    accuracy                           0.86      1115
   macro avg       0.74      0.86      0.78      1115
weighted avg       0.90      0.86      0.87      1115



In [32]:
# Fitting Decision Tree Classification to the Training set
dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
dt.fit(X_train, y_train)

In [33]:
# Predicting the Test set results
y_pred = dt.predict(X_test)

# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


Accuracy: 97.22%


In [34]:
# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Classification report
print(classification_report(y_test, y_pred))


[[944   5]
 [ 26 140]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       949
           1       0.97      0.84      0.90       166

    accuracy                           0.97      1115
   macro avg       0.97      0.92      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [35]:
# final result print 
print("Naive Bayes Accuracy: %.2f%%" % (accuracy * 100.0))
print("Decision Tree Accuracy: %.2f%%" % (accuracy * 100.0))

Naive Bayes Accuracy: 97.22%
Decision Tree Accuracy: 97.22%
