# SMS Spam Detector or Spam Classifier Project

In [2]:
# nltk.download('stopwords')  # run once if the NLTK stop-word corpus is not available
import re
import nltk
import pandas as pd

# `sklearn.externals.joblib` was removed in scikit-learn 0.23, so importing it
# unconditionally breaks this cell on current installs. Prefer the standalone
# joblib package and fall back to the bundled copy on old scikit-learn only.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib


# Load the SMS Spam Collection dataset (UCI Machine Learning Repository).
# The file is tab-separated and has no header row, so column names are supplied here.
# Link: https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
messages = pd.read_csv('SMSSpamCollection', sep='\t', names=["label", "message"])
messages



Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# Data cleaning and pre-processing for NLP

In [3]:

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

# Load the English stop-word list once, as a set. Calling stopwords.words()
# inside the loop re-evaluates the list for every token and then does a linear
# search through it, which dominates the runtime of this cell.
stop_words = set(stopwords.words('english'))

# Build one cleaned document per SMS. Almost every SMS is a single sentence,
# so no sentence tokenization is performed.
corpus = []
for text in messages['message']:
    # Replace every non-letter character (digits, punctuation, symbols) with a space.
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    # Lower-case and split on whitespace to obtain word tokens.
    words = letters_only.lower().split()
    # Drop stop words and reduce the remaining words to their Porter stems.
    stems = [ps.stem(word) for word in words if word not in stop_words]
    corpus.append(' '.join(stems))
    
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)  # cap the vocabulary at 2500 terms
X = cv.fit_transform(corpus).toarray()   # dense document-term count matrix

# One-hot encode the label column. dtype=int pins the values to 0/1: pandas >= 2.0
# returns bool columns by default, which would print True/False and contradict
# the "1 mean spam 0 mean not spam" convention used below.
y = pd.get_dummies(messages['label'], dtype=int)
print('y lable data')
print(y)
# Keep only the 'spam' indicator as a 1-D array (selected by name rather than
# by position so the result does not depend on column order).
y = y['spam'].values
print('Updated dataset 1 mean spam 0 mean not spam: ', y)
  

y lable data
      ham  spam
0       1     0
1       1     0
2       0     1
3       1     0
4       1     0
...   ...   ...
5567    0     1
5568    1     0
5569    1     0
5570    1     0
5571    1     0

[5572 rows x 2 columns]
Updated dataset 1 mean spam 0 mean not spam:  [0 0 1 ... 0 0 0]


# Training and Testing 

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the samples for testing; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Train a multinomial Naive Bayes classifier on the training split.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

# Predict labels for the held-out messages: 0 means not spam, 1 means spam.
y_pred = spam_detect_model.predict(X_test)
print('Predected output is: ', y_pred)

Predected output is:  [0 1 0 ... 0 1 0]


# Find the accuracy of the model

In [5]:
from sklearn.metrics import accuracy_score

# Fraction of held-out messages whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)
# To compare results, try a different model or algorithm,
# or replace stemming with lemmatization in the cleaning step.

0.9856502242152466


# Enter your input and check whether the SMS is spam or not spam

In [7]:
sms = input('Type your SMS and press enter: ') #Take sms input
                            #Example SMS :Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 (Spam sms)
                            #Example SMS:Go until jurong point, crazy  (ham or not spam sms)
                            #or you can put any type of sms you want

# Clean the input exactly like the training corpus (letters only, lower-case,
# stop words removed, Porter-stemmed). The vectorizer's vocabulary consists of
# stems, so an uncleaned word such as "entry" would never match "entri" and
# would be silently ignored.
english_stopwords = set(stopwords.words('english'))
sms_words = re.sub('[^a-zA-Z]', ' ', sms).lower().split()
sms_clean = ' '.join(ps.stem(word) for word in sms_words if word not in english_stopwords)

data = [sms_clean]
vect = cv.transform(data).toarray()   # convert the cleaned SMS into a count vector
my_prediction = spam_detect_model.predict(vect)

import numpy as np # used to convert the one-element prediction array into a scalar
# np.asscalar was removed in NumPy 1.23; .item() is the supported replacement.
val = np.asarray(my_prediction).item()

if val == 0:
    print('This is ham or not spam sms')
else:
    print('This is spam SMS')

Type your SMS and press enter: you win a credit card of 1000$
This is spam SMS
