## **Importing Necessary Libraries**

In [129]:
import pandas as pd
import re
import nltk  # Natural Language Tool Kit
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # For stemming

nltk.download('wordnet') #For Lemmetization
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## **Importing Data**

In [130]:
dataset = pd.read_csv('SMSSpamCollection',sep = '\t',names=['labels','messages'], index_col=None)
dataset

Unnamed: 0,labels,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## **Text Preprcocessing**

In [131]:
ps = PorterStemmer()
lm = WordNetLemmatizer()

corpus = []
for i in range(0,len(dataset)):
    review = re.sub('[^a-zA-z]','',dataset['messages'][i])
    review  = review.lower()
    review  = review.split()
    #review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review  = [lm.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [132]:
corpus

['gountiljurongpointcrazyavailableonlyinbugisngreatworldlaebuffetcinetheregotamorewat',
 'oklarjokingwifuoni',
 'freeentryinawklycomptowinfacupfinaltktsstmaytextfatotoreceiveentryquestionstdtxtratetcsapplyovers',
 'udunsaysoearlyhorucalreadythensay',
 'nahidontthinkhegoestousfhelivesaroundherethough',
 'freemsgheytheredarlingitsbeenweeksnowandnowordbackidlikesomefunyouupforitstilltbokxxxstdchgstosendtorcv',
 'evenmybrotherisnotliketospeakwithmetheytreatmelikeaidspatent',
 'asperyourrequestmellemelleoruminnaminungintenurunguvettamhasbeensetasyourcallertuneforallcallerspresstocopyyourfriendscallertune',
 'winnerasavaluednetworkcustomeryouhavebeenselectedtoreceiveaprizerewardtoclaimcallclaimcodeklvalidhoursonly',
 'hadyourmobilemonthsormoreurentitledtoupdatetothelatestcolourmobileswithcameraforfreecallthemobileupdatecofreeon',
 'imgonnabehomesoonandidontwanttotalkaboutthisstuffanymoretonightkivecriedenoughtoday',
 'sixchancestowincashfromtopoundstxtcshandsendtocostpdaydaystsandcsapplyrepl

##**Model Building** - Vectorization and I/p and O/p

In [133]:
cv = CountVectorizer(max_features=2500)
x= cv.fit_transform(corpus).toarray()
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [134]:
le = LabelEncoder()
dataset['labels'] = le.fit_transform(dataset['labels'])

In [135]:
y = dataset[['labels']]
y

Unnamed: 0,labels
0,0
1,0
2,1
3,0
4,0
...,...
5567,1
5568,0
5569,0
5570,0


In [136]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.20,random_state = 12)

## **Model Training**

In [137]:
nb_classifier = MultinomialNB()
nb_classifier.fit(x_train,y_train)

MultinomialNB()

## **Model Testing**

In [138]:
y_pred = nb_classifier.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

## **Model Evaluation**

In [139]:
print('Accuracy Score  :',  accuracy_score(y_test,y_pred))
print('Precision Score  :',  precision_score(y_test,y_pred))
print('Confusion Matrix :\n',  confusion_matrix(y_test,y_pred))

Accuracy Score  : 0.8591928251121076
Precision Score  : 1.0
Confusion Matrix :
 [[957   0]
 [157   1]]
