# Step 1. Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import string
import re

# Step 2. Importing Dataset

In [3]:
# Load the SMS collection into a DataFrame; the file is tab-separated, hence sep="\t".
df = pd.read_csv('smsspamcollection.tsv', sep ="\t")

In [4]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111.0,9.0
1,ham,Ok lar... Joking wif u oni...,29.0,6.0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155.0,6.0
3,ham,U dun say so early hor... U c already then say...,49.0,6.0
4,ham,"Nah I don't think he goes to usf, he lives aro...",61.0,2.0


In [5]:
# Count the messages per label to see how balanced the two classes are.
df['label'].value_counts()

ham     4826
spam     757
Name: label, dtype: int64

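This dataset contains many more ham messages than spam (4826 vs. 757), so a classifier trained on it will tend to lean towards predicting ham. Its mistakes are therefore more likely to be spam that slips through as ham than ham that is wrongly flagged as spam. That is the safer kind of error, because we don't want to miss a friend's message just because it went into the spam folder.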

#  Step 3. Data Preprocessing

#### Step 3.(a). Checking if there are any Null (NaN) values

In [6]:
# Count the missing (NaN) values in each column.
df.isnull().sum()

label       0
message     0
length     11
punct      11
dtype: int64

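The `label` and `message` columns have no NaN values. The `length` and `punct` columns each have 11 missing values, but they are not used for the model below, so no rows need to be dropped.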

#### Step 3.(b). Checking if there is just an empty string in any message

In [7]:
# Collect the index labels of rows whose message is empty or whitespace-only.
#
# Rows are accessed by attribute (row.Index / row.message) instead of unpacking
# every column, which avoids three problems of a positional-unpacking loop:
#   * appending an undefined name instead of the row index (NameError on the
#     first blank message),
#   * rebinding the builtin `len` as a loop variable for the rest of the session,
#   * depending on the exact number of columns in df.
# `not message.strip()` is used rather than str.isspace() because "".isspace()
# is False, so truly empty strings would otherwise go unreported. Non-string
# values (e.g. NaN) are skipped here; they are covered by the NaN check.
blanks = [
    row.Index
    for row in df.itertuples()
    if isinstance(row.message, str) and not row.message.strip()
]
print(blanks)

[]


No message is blank (empty or whitespace-only), so the message data is complete.

In [8]:
# Keep the message column under its own name for the cleaning steps.
# NOTE: no explicit .copy() is made here, so this refers to the column taken from df.
message_data_copy = df['message']

In [9]:
# Display the message Series (pandas shows only the first and last rows).
message_data_copy

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5578    This is the 2nd time we have tried 2 contact u...
5579                 Will ü b going to esplanade fr home?
5580    Pity, * was in mood for that. So...any other s...
5581    The guy did some bitching but I acted like i'd...
5582                           Rofl. Its true to its name
Name: message, Length: 5583, dtype: object

#### Step 3.(c). Cleaning the text: lower-casing, expanding contractions and other short forms, and removing punctuation

In [10]:
def clean_text(text):
    """Lower-case a message, expand common contractions and strip punctuation.

    Args:
        text: The raw message string.

    Returns:
        The cleaned string: lower-cased, with the contractions listed in
        ``replacements`` expanded, and every character of the final
        punctuation class removed. Characters outside that class (for example
        ``!``, ``:``, ``/`` and a remaining ``'``) are left in place.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    # The "'ll" pattern must not end with an extra apostrophe, otherwise
    # "i'll" / "we'll" never match and are left unexpanded.
    replacements = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"\'ll", " will"),
        (r"\'re", " are"),
        (r"\'ve", " have"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"where's", "where is"),
        (r"here's", "here is"),
        (r"there's", "there is"),
        (r"let's", "let us"),
        (r"that's", "that is"),
        # Punctuation is removed last so the apostrophe-based patterns above
        # still see the original text.
        (r"[-()\"#@$%^&*;.|<>?,=+]", ""),
    )

    # All patterns are written in lower case, so normalise the case first.
    text = text.lower()
    for pattern, replacement in replacements:
        text = re.sub(pattern, replacement, text)

    return text

In [11]:
# Run clean_text on every message and rebind the name to the cleaned Series.
message_data_copy = message_data_copy.apply(clean_text)

In [12]:
# Preview the cleaned messages to confirm the lower-casing and punctuation removal.
message_data_copy.head()

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
4    nah i don't think he goes to usf he lives arou...
Name: message, dtype: object

#  Step 4. Splitting Dataset into Training and Test Set

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# X holds the cleaned messages, i.e. the input the prediction is based on.
# y holds the ham/spam label, i.e. the target we predict for the test set.

X = message_data_copy
y = df['label']

In [15]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [16]:
# Print the shape of each split, training pair first, then the test pair.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(3740,)
(3740,)
(1843,)
(1843,)


# Step 5. Extracting Text Feature

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
# TF-IDF vectorizer with default settings, turning raw text into weighted term features.
vectorizer = TfidfVectorizer()

In [19]:
# Fit the vectorizer on the training messages only and transform them in one step.
# NOTE: X_train_tfidf is not referenced by the later cells shown here; the pipeline
# built below does its own vectorization.
X_train_tfidf = vectorizer.fit_transform(X_train)

#  Step 6. We have to perform the Text Feature Extraction  processes on our test set too. But there is a convenient way of doing this. 



#### We can use a Pipeline, which extracts the features and also trains (fits) our classifier. So we can basically skip Step 5.

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [21]:
# Chain the two steps: 'tfidf' vectorizes the text, 'clf' is a linear SVM fitted on those features.
text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])

In [22]:
# Fit the whole pipeline (vectorizer, then classifier) on the training split.
text_clf.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

# Model Evaluation

In [23]:
# Predict a label for every held-out test message; the pipeline vectorizes them itself.
predictions = text_clf.predict(X_test)

In [24]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score  

#### Confusion Matrix

In [25]:
# Rows are the true labels and columns the predicted labels, in sorted label order (ham, spam).
print(confusion_matrix(y_test,predictions))

[[1592    8]
 [  22  221]]


#### Classification Report

In [26]:
# Per-class precision, recall, F1 and support on the test set.
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1600
        spam       0.97      0.91      0.94       243

    accuracy                           0.98      1843
   macro avg       0.98      0.95      0.96      1843
weighted avg       0.98      0.98      0.98      1843



#### Accuracy Report 

In [27]:
# Overall fraction of test messages classified correctly.
print(accuracy_score(y_test,predictions))

0.9837221920781335


# Test with your own message

In [28]:
# Classify a message typed in by the user.
# The pipeline was trained on messages that had been passed through clean_text,
# so the same cleaning is applied here to keep inference consistent with training.
# predict expects an iterable of documents, hence the single-element list.
msg = input("Enter your message : ")
predict_msg = text_clf.predict([clean_text(msg)])
print(predict_msg)

Enter your message : You have won 30 million dollar. Send me your credit card details
['spam']
