In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
%matplotlib inline

# Read the raw corpus line by line. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection, and
# the explicit encoding avoids depending on the platform's default codec.
with open('smsspamcollection/SMSSpamCollection', encoding='utf-8') as sms_file:
    messages = [line.rstrip() for line in sms_file]

# for mess_no,message in enumerate(messages[:10]):
#     print(mess_no,message)
#     print()

# Load the tab-separated corpus into a two-column frame: label and message text.
messages = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
)

# Per-label summary statistics of the message column.
print(messages.groupby(by='label').describe())

# Character count of each message, stored as an extra column.
messages['length'] = messages['message'].map(len)

print(messages.head())

# sns.FacetGrid(messages,col='label').map(sns.histplot,'length')

# mess = 'Sample message! Notice: it has punctuation.'

# nopunc = ''.join([c for c in mess if c not in string.punctuation])

# clean_mess = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

# Lazily-built cache of the NLTK English stop-word set (filled by text_process).
_ENGLISH_STOPWORDS = None


def text_process(mess):
    """Tokenize a message, dropping punctuation and English stop words.

    Steps:
      1. Remove every character that appears in ``string.punctuation``.
      2. Split the remaining text on whitespace.
      3. Discard tokens whose lowercase form is an NLTK English stop word.

    Parameters
    ----------
    mess : str
        Raw message text.

    Returns
    -------
    list of str
        The surviving tokens, in their original order and original casing.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # The original called stopwords.words('english') for every token and
        # scanned the resulting list linearly. Build it once and keep it as a
        # frozenset so each membership test is a constant-time lookup.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    nopunc = ''.join(char for char in mess if char not in string.punctuation)

    return [word for word in nopunc.split() if word.lower() not in _ENGLISH_STOPWORDS]

# messages['message'] = messages['message'].apply(text_process)

# print(messages['message'][0])

      message                                                               
        count unique                                                top freq
label                                                                       
ham      4825   4516                             Sorry, I'll call later   30
spam      747    653  Please call our customer service representativ...    4
  label                                            message  length
0   ham  Go until jurong point, crazy.. Available only ...     111
1   ham                      Ok lar... Joking wif u oni...      29
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...     155
3   ham  U dun say so early hor... U c already then say...      49
4   ham  Nah I don't think he goes to usf, he lives aro...      61


In [8]:
# Fit a bag-of-words vectorizer on every message, with text_process as the
# analyzer that turns one message into its list of tokens.
bow_transformer = (
    CountVectorizer(analyzer=text_process)
    .fit(messages['message'])
)

In [16]:
# Encode every message as token counts using the fitted bag-of-words vectorizer.
messages_bow = bow_transformer.transform(messages['message'])

In [10]:
# Fit the TF-IDF weighting on the bag-of-words counts of all messages.
tfidf_transformer = (
    TfidfTransformer()
    .fit(messages_bow)
)

In [17]:
# Re-weight the raw token counts with the fitted TF-IDF transformer.
messages_tfidf = tfidf_transformer.transform(messages_bow)

In [4]:
from sklearn.naive_bayes import MultinomialNB # can use other classifiers like random forest

In [5]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features of the
# whole dataset (no hold-out at this stage).
spam_detect_model = MultinomialNB().fit(
    messages_tfidf,
    messages['label'],
)

In [18]:
# In-sample predictions: the model scores the same messages_tfidf matrix it was
# fitted on, so these do not estimate performance on unseen messages.
predictions = spam_detect_model.predict(messages_tfidf)

In [19]:
# Display the predicted labels (the cell's last expression is echoed as output).
predictions

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [20]:
# Import only; the stray bare `train_test_split` expression that merely echoed
# the function object has been removed.
from sklearn.model_selection import train_test_split

In [21]:
# Hold out 30% of the messages for evaluation. random_state pins the shuffle so
# re-running the notebook from a fresh kernel reproduces the same split, and
# therefore the same downstream metrics.
msg_train, msg_test, label_train, label_test = train_test_split(
    messages['message'],
    messages['label'],
    test_size=0.3,
    random_state=42,
)

In [22]:
from sklearn.pipeline import Pipeline

In [23]:
# Chain tokenization/counting, TF-IDF weighting and the classifier into a
# single estimator, so raw message text can be passed straight to fit/predict.
pipeline = Pipeline(
    steps=[
        ('bow', CountVectorizer(analyzer=text_process)),
        ('tfidf', TfidfTransformer()),
        ('classifier', MultinomialNB()),
    ]
)
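# The step names are arbitrary labels, but each must be unique: they are how a step is addressed later (e.g. pipeline.named_steps['bow']).
# A pipeline is just an ordered list of the steps that will be run, one after another.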

In [24]:
# Fitting the pipeline on raw training text runs every stage in order
# (bow -> tfidf -> classifier), replacing the separate manual steps done earlier.
pipeline.fit(msg_train,label_train) # in this one step the pipeline does all the steps above

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x0A62F418>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

In [25]:
# Predict labels for the held-out test messages. This rebinds `predictions`,
# replacing the earlier in-sample predictions from spam_detect_model.
predictions = pipeline.predict(msg_test)

In [26]:
from sklearn.metrics import classification_report

In [27]:
# Per-class precision, recall and F1 of the pipeline on the held-out test split.
print(classification_report(label_test,predictions))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1489
        spam       1.00      0.69      0.82       183

    accuracy                           0.97      1672
   macro avg       0.98      0.84      0.90      1672
weighted avg       0.97      0.97      0.96      1672

