In [2]:
import pandas as pd
import numpy as np

In [5]:
data = pd.read_csv('SMSSpamCollection.txt',delimiter='\t',header=None,names=['label','message'])
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
data.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [16]:
#Example of ham
data['message'][25]

"Just forced myself to eat a slice. I'm really not hungry tho. This sucks. Mark is getting worried. He knows I'm sick when I turn down pizza. Lol"

In [22]:
#Example of spam
data['message'][2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [24]:
#Removing NAN values
data.isnull().sum()

label      0
message    0
dtype: int64

In [25]:
#Detecting and removing empty strings
blanks = []
for i,lab,mes in data.itertuples():
    if type(mes) == str:
        if mes.isspace():
            blanks.append(i)
print(len(blanks), 'blanks: ', blanks)

0 blanks:  []


**So there are no empty spaces**

### Splitting data in to train and test set

In [26]:
from sklearn.model_selection import train_test_split

X = data['message']
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Build pipelines to vectorize the data, then train and fit a model

**Now that we have sets to train and test, we'll develop a selection of pipelines, each with a different model.**

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Naïve Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                     ('clf', MultinomialNB()),
])

# Linear SVC:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                     ('clf', LinearSVC()),
])

## First Pipeline : 

In [35]:
text_clf_nb.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [36]:
predictions = text_clf_nb.predict(X_test)
# Report the confusion matrix
from sklearn import metrics
print(f'Confusion Matrix \n\n {metrics.confusion_matrix(y_test,predictions)} \n')
# Print a classification report
print(f'Classification Report \n\n {metrics.classification_report(y_test,predictions)}\n')
# Print the overall accuracy
print(f'Accuracy Score \n\n {metrics.accuracy_score(y_test,predictions)}')

Confusion Matrix 

 [[1592    1]
 [  54  192]] 

Classification Report 

               precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1593
        spam       0.99      0.78      0.87       246

   micro avg       0.97      0.97      0.97      1839
   macro avg       0.98      0.89      0.93      1839
weighted avg       0.97      0.97      0.97      1839


Accuracy Score 

 0.9700924415443176


## Second Pipeline :

In [37]:
text_clf_lsvc.fit(X_train, y_train)

predictions = text_clf_lsvc.predict(X_test)

# Report the confusion matrix
from sklearn import metrics
print(f'Confusion Matrix \n\n {metrics.confusion_matrix(y_test,predictions)} \n')

# Print a classification report
print(f'Classification Report \n\n {metrics.classification_report(y_test,predictions)}\n')

# Print the overall accuracy
print(f'Accuracy Score \n\n{metrics.accuracy_score(y_test,predictions)}')

Confusion Matrix 

 [[1587    6]
 [  15  231]] 

Classification Report 

               precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.94      0.96       246

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.98      0.97      0.97      1839
weighted avg       0.99      0.99      0.99      1839


Accuracy Score 

0.9885807504078303
