In [1]:
import pandas as pd

In [2]:
messages = pd.read_csv("SMSSpamCollection",sep="\t",names = ["label","messages"])

In [3]:
messages.head()

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [5]:
corpus = []
ps = PorterStemmer()

In [6]:
for i in range(len(messages)):
    review = re.sub("[^A-Za-z]"," ",messages["messages"][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords.words("english")]
    review = " ".join(review)
    corpus.append(review)


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(corpus).toarray()

In [8]:
X.shape

(5572, 5000)

In [9]:
y = pd.get_dummies(messages["label"],drop_first=True)
y.columns = ["Classifier"]

In [10]:
y

Unnamed: 0,Classifier
0,0
1,0
2,1
3,0
4,0
5,1
6,0
7,0
8,1
9,1


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=0)

In [13]:
from sklearn.naive_bayes import MultinomialNB

In [14]:
spam_detect_model = MultinomialNB().fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [15]:
y_pred = spam_detect_model.predict(X_test)

In [16]:
from sklearn.metrics import confusion_matrix

In [17]:
cm = confusion_matrix(y_test,y_pred)

In [18]:
cm

array([[946,   9],
       [  8, 152]])

In [19]:
from sklearn.metrics import accuracy_score

In [20]:
accuracy = accuracy_score(y_test,y_pred)

In [21]:
accuracy

0.9847533632286996

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [23]:
pipe = Pipeline([("TfIdf",TfidfVectorizer()),("model",LinearSVC())])

In [24]:
import pandas as pd
messages = pd.read_csv("SMSSpamCollection",sep="\t",names = ["label","messages"])
X = messages["messages"]
y=messages["label"]
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=0)
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('TfIdf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [25]:
y_pred=pipe.predict(X_test)

In [26]:
y_pred

array(['ham', 'spam', 'ham', ..., 'ham', 'spam', 'ham'], dtype=object)

In [27]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [28]:
cm = confusion_matrix(y_test,y_pred)

In [29]:
cm

array([[954,   1],
       [  9, 151]])

In [30]:
accuracy = accuracy_score(y_test,y_pred)

In [31]:
accuracy

0.9910313901345291

In [32]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       955
        spam       0.99      0.94      0.97       160

   micro avg       0.99      0.99      0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

