# Miniproject : SMS SPAM/HAM Prediction 
Group members: BECOC305, BECOC306, BECOC311, BECOC320.

In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# Load the tab-separated SMS dataset into a DataFrame.
df = pd.read_csv(
    'smsspamcollection.tsv',
    sep='\t',
)

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(5572, 4)

In [5]:
# Count the missing values in each column.
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [6]:
# Number of messages per class, to see how balanced ham and spam are.
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

# Data Cleaning and Preprocessing
1) Removing punctuation marks <br>
2) Tokenizing the data <br>
3) Removing special symbols from text <br>
4) Converting all text to lowercase <br>
5) Removing English stop words <br>
6) Stemming each remaining word <br>

In [7]:
import re
import nltk
#nltk.download('stopwords')  # run once: the stop-word list (words irrelevant for prediction) must be downloaded
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer  # reduces each word to its stem, e.g. 'crazy' -> 'crazi'

# Loop-invariant helpers are built once instead of once per word / per row:
# the compiled pattern matches every character that is NOT a letter or digit,
# the stop-word set gives O(1) membership tests, and one stemmer is reused.
non_alnum = re.compile('[^a-zA-Z0-9]')
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []    # cleaned message text, one string per row of df
msglabel = []  # label of the corresponding row, in the same order as corpus

# Iterate over the columns themselves rather than range(0, 5572), so the cell
# keeps working when the dataset has a different number of rows.
for raw_msg, label in zip(df['message'], df['label']):
    msg = non_alnum.sub(' ', raw_msg)  # replace each removed character with a space
    msg = msg.lower()
    msg = msg.split()
    msg = [ps.stem(word) for word in msg if word not in english_stopwords]
    msg = ' '.join(msg)
    corpus.append(msg)
    msglabel.append(label)

In [8]:
# corpus holds one cleaned message string per row of df.
# Uncomment the next line to dump every message:
# print(corpus)
print(len(corpus))

5572


In [9]:
# msglabel holds the label matching each entry of corpus, in the same order.
# Uncomment the next line to dump every label:
# print(msglabel)
print(len(msglabel))

5572


In [10]:
# Pair every cleaned message with its label; column order is Msg, then Label.
columns = {
    'Msg': corpus,
    'Label': msglabel,
}
final_df = pd.DataFrame(columns)
final_df.head(5)

Unnamed: 0,Msg,Label
0,go jurong point crazi avail bugi n great world...,ham
1,ok lar joke wif u oni,ham
2,free entri 2 wkli comp win fa cup final tkt 21...,spam
3,u dun say earli hor u c alreadi say,ham
4,nah think goe usf live around though,ham


In [11]:
# Here we will extract features using CountVectorizer or TfidfVectorizer

# As per university guidelines Classification Algorithms that we will be trying are as follows :
1) Logistic Regression <br>
2) Decision Tree <br>
3) RandomForest <br>
4) SVM <br>
5) NaiveBayes <br>
6) KNN <br>

In [12]:
from sklearn.model_selection import train_test_split

# Features are the cleaned message text; the target is the ham/spam label.
X, y = final_df['Msg'], final_df['Label']

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training split only and convert that split to a
# TF-IDF matrix; the displayed shape is (training messages, vocabulary terms).
X_train_tfidf = vectorizer.fit_transform(X_train)
X_train_tfidf.shape

(3733, 5907)

In [14]:
from sklearn.svm import LinearSVC
# Fit a linear SVM directly on the TF-IDF matrix built in the previous cell.
# The later visible cells predict with the msg_clf pipeline, not with this clf.
clf = LinearSVC()
clf.fit(X_train_tfidf,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [15]:
from sklearn.pipeline import Pipeline
# TfidfVectorizer and LinearSVC are already imported by the two preceding cells.
# Chaining them in one Pipeline lets fit()/predict() accept message strings
# directly, with vectorisation applied as the first step.
msg_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])
# Feed the training data through the pipeline
msg_clf.fit(X_train, y_train)  

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [16]:
# Form a prediction set: predicted labels for the held-out X_test messages
predictions = msg_clf.predict(X_test)

In [17]:
# Report the confusion matrix, comparing the true test labels (first argument)
# with the pipeline's predictions (second argument)
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[1589    4]
 [  16  230]]


In [18]:
# Print the overall accuracy of the predictions on the held-out test set
print(metrics.accuracy_score(y_test,predictions))

0.9891245241979336


In [19]:
# Print a classification report (per-class precision, recall, f1-score and support)
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.98      0.93      0.96       246

    accuracy                           0.99      1839
   macro avg       0.99      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [20]:
# Predict for user input
# The result is kept in its own name so the test-set `predictions` used by the
# metric cells are not overwritten (re-running those cells would otherwise fail
# on a length mismatch with y_test).
# NOTE(review): the pipeline was fitted on stemmed, stop-word-filtered text,
# while this message is passed in raw; confirm whether inputs should be
# cleaned the same way before prediction.
user_prediction = msg_clf.predict(['Last weekends draw shows that you won a £1000 prize GUARANTEED.'])
print(user_prediction)

['spam']


In [24]:
# Predict for user input
# The result is kept in its own name so the test-set `predictions` used by the
# metric cells are not overwritten (re-running those cells would otherwise fail
# on a length mismatch with y_test).
# NOTE(review): the pipeline was fitted on stemmed, stop-word-filtered text,
# while this message is passed in raw; confirm whether inputs should be
# cleaned the same way before prediction.
user_prediction = msg_clf.predict(['Hello. Can you send me the 5th assignment, please. I will give you $1000'])
print(user_prediction)

['ham']


In [22]:
# Predict for user input
# The result is kept in its own name so the test-set `predictions` used by the
# metric cells are not overwritten (re-running those cells would otherwise fail
# on a length mismatch with y_test).
# NOTE(review): the pipeline was fitted on stemmed, stop-word-filtered text,
# while this message is passed in raw; confirm whether inputs should be
# cleaned the same way before prediction.
user_prediction = msg_clf.predict(['Should we hangout tommorow?'])
print(user_prediction)

['ham']


In [25]:
# Persist the fitted pipeline to disk. The context manager closes the file
# handle (flushing the data) even if pickling raises, instead of leaving an
# anonymous open file behind.
with open('mypipeline.pkl', 'wb') as model_file:
    pickle.dump(msg_clf, model_file)

In [26]:
# Reload the pipeline from 'mypipeline.pkl', closing the file handle afterwards.
# pickle can execute arbitrary code while loading, so only unpickle files from
# a trusted source.
with open('mypipeline.pkl', 'rb') as model_file:
    new_msg_clf = pickle.load(model_file)

In [27]:
# Check that the reloaded pipeline still predicts. The result is kept in its
# own name so the test-set `predictions` used by the metric cells are not
# overwritten.
reloaded_prediction = new_msg_clf.predict(['Should we hangout tommorow?'])
print(reloaded_prediction)

['ham']
