In [2]:
#Perform Imports

import numpy as np
import pandas as pd

In [7]:
#Load Dataset

df = df = pd.read_csv('../TextFiles/smsspamcollection.tsv', sep='\t')

In [8]:
df.head(10)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2
5,spam,FreeMsg Hey there darling it's been 3 week's n...,147,8
6,ham,Even my brother is not like to speak with me. ...,77,2
7,ham,As per your request 'Melle Melle (Oru Minnamin...,160,6
8,spam,WINNER!! As a valued network customer you have...,157,6
9,spam,Had your mobile 11 months or more? U R entitle...,154,2


In [9]:
len(df)

5572

In [10]:
#Check for Missing Values

df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [11]:
#Check the labels - ham and spam

df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [12]:
#Since approximately 87% of our data is ham, our model has to perform better than 87% to beat random chance
#Let's work on our model, then

In [13]:
#Import models and split the data into train and test sets

from sklearn.model_selection import train_test_split

X = df['message'] 
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [15]:
#To process the text, tokenize and filter out stopwords we must use CountVectorizer
#After that, we would have also to use Tf-idf to transform counts to frequencies because we want to add weights to the words...
#The more frequent a word is, lower will be its weight
#Luckly, we can use TfidVectorizer - it will do both steps at once (Vectorize and add the weights)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train) 
X_train_tfidf.shape

(3900, 7263)

In [18]:
#Now we need a classifier
#We will use LinearSVC because it deals better with sparse input

from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf,y_train) 

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [19]:
#Let's make the process easier and create a pipeline so we don't need to repeat the same process above with our test set

from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

# Feed the training data through the pipeline
text_clf.fit(X_train, y_train)  

  if LooseVersion(joblib_version) < '0.12':


Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [20]:
#Now our classifer is ready to go. Let's test it and see some of the results

predictions = text_clf.predict(X_test)

In [21]:
# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[1445    3]
 [  10  214]]


In [22]:
# We can see by the results in the confusion matrix that our model is working very well
#Let's now print the classification report

print(metrics.classification_report(y_test,predictions))



              precision    recall  f1-score   support

         ham       0.99      1.00      1.00      1448
        spam       0.99      0.96      0.97       224

   micro avg       0.99      0.99      0.99      1672
   macro avg       0.99      0.98      0.98      1672
weighted avg       0.99      0.99      0.99      1672



In [23]:
#Excellent results, let's now just check the accuracy of our model

print(metrics.accuracy_score(y_test,predictions))

0.9922248803827751


In [None]:
#Awesome number. Our model is ready to be used