In [1]:
import pandas as pd

In [2]:
# SMS Spam Collection dataset, available from Kaggle:
# https://www.kaggle.com/uciml/sms-spam-collection-dataset
# The file is decoded as latin-1.
DATA_PATH = 'SMS_Data_/spam.csv'
df = pd.read_csv(DATA_PATH, encoding='latin-1')

In [3]:
# Keep only the first two columns of the raw frame (v1 = label, v2 = message text).
messages = df.iloc[:,[0,1]]

In [4]:
# Preview the first rows of the two selected columns.
messages.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Replace the raw CSV headers (v1, v2) with descriptive names.
# NOTE(review): this assigns the column labels on `messages` in place.
messages.columns=["Label","Text"]

In [6]:
# Confirm the new column names are in place.
messages.head()

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# (rows, columns) of the working frame.
messages.shape

(5572, 2)

In [8]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [9]:
# Fetch only the NLTK resources this notebook relies on, so a fresh kernel can
# run top to bottom. A bare nltk.download() opens the interactive downloader,
# which is unsuitable for unattended runs.
#   stopwords - English stop-word list used when cleaning the messages
#   wordnet   - data backing WordNetLemmatizer
#   omw-1.4   - NOTE(review): required alongside wordnet by some NLTK releases;
#               harmless where it is not needed - confirm for the pinned version
for nltk_resource in ("stopwords", "wordnet", "omw-1.4"):
    nltk.download(nltk_resource, quiet=True)

In [10]:
# One lemmatizer instance, reused for every token in the cleaning loop.
Lemmatizer = WordNetLemmatizer() 

In [11]:
# Build the cleaned corpus: one normalised string per SMS, in row order.
#
# The stop-word list is loaded once and stored as a set. Calling
# stopwords.words('english') inside the comprehension repeated that call for
# every token and made each membership test a linear scan over a list.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the values directly instead of indexing by label, so the loop
# does not depend on the frame having a 0..n-1 index.
for raw_text in messages["Text"]:
    # Replace every non-letter with a space, lower-case, split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
    # Drop stop words and lemmatize what remains.
    lemmas = [
        Lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ]
    corpus.append(' '.join(lemmas))

In [12]:
# Sanity check: there should be one cleaned string per message.
len(corpus)

5572

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Bag-of-words vectorizer whose vocabulary is capped at MAX_FEATURES terms.
# NOTE(review): the reason for choosing exactly 2626 is not recorded here -
# confirm before tuning.
MAX_FEATURES = 2626
countvectorizer = CountVectorizer(max_features=MAX_FEATURES)

In [15]:
# Learn the vocabulary from the cleaned corpus.
# NOTE: fit() returns the vectorizer itself, so X is the fitted CountVectorizer
# (it is queried for feature names later), not a feature matrix.
X = countvectorizer.fit(corpus)

In [16]:
# Vocabulary learned by the vectorizer, in column order.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back to
# the old one on older releases.
if hasattr(X, "get_feature_names_out"):
    # tolist() converts the returned array into a plain list of Python strings.
    feature_names = X.get_feature_names_out().tolist()
else:
    feature_names = X.get_feature_names()
feature_names

['aah',
 'aathi',
 'abi',
 'abiola',
 'able',
 'abt',
 'abta',
 'ac',
 'acc',
 'accept',
 'access',
 'accident',
 'accidentally',
 'accordingly',
 'account',
 'ache',
 'across',
 'action',
 'activate',
 'activity',
 'actually',
 'ad',
 'add',
 'added',
 'addicted',
 'addie',
 'address',
 'admirer',
 'adore',
 'adult',
 'advance',
 'advice',
 'advise',
 'ae',
 'affair',
 'affection',
 'afraid',
 'aft',
 'afternoon',
 'aftr',
 'age',
 'ago',
 'ah',
 'aha',
 'ahead',
 'ahmad',
 'aight',
 'aint',
 'air',
 'airport',
 'aiya',
 'aiyah',
 'aiyar',
 'aiyo',
 'al',
 'album',
 'alcohol',
 'alert',
 'alex',
 'alfie',
 'alive',
 'allah',
 'almost',
 'alone',
 'along',
 'already',
 'alright',
 'alrite',
 'also',
 'always',
 'alwys',
 'amazing',
 'american',
 'among',
 'amount',
 'amp',
 'amt',
 'an',
 'angry',
 'announcement',
 'another',
 'ansr',
 'answer',
 'answering',
 'anthony',
 'anti',
 'anybody',
 'anymore',
 'anyone',
 'anythin',
 'anything',
 'anytime',
 'anyway',
 'anyways',
 'anywhere',

In [17]:
# Encode every message as term counts over the fitted vocabulary, then convert
# the result to a dense NumPy array.
term_counts = countvectorizer.transform(corpus)
X1 = term_counts.toarray()

In [18]:
# Show the count matrix as a labelled frame: one row per message, one column
# per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on older releases.
if hasattr(X, "get_feature_names_out"):
    vocabulary_columns = X.get_feature_names_out()
else:
    vocabulary_columns = X.get_feature_names()
pd.DataFrame(X1, columns=vocabulary_columns)

Unnamed: 0,aah,aathi,abi,abiola,able,abt,abta,ac,acc,accept,...,yo,yoga,yogasana,yor,yr,yummy,yun,yuo,yup,zed
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# One-hot encode the labels: one indicator column per distinct label value.
y = pd.get_dummies(messages["Label"])

In [20]:
# Target vector: the "spam" indicator column of the one-hot encoded labels.
# It is derived directly from the labels rather than from the previous value
# of y, so the cell can be re-run safely; y = y.iloc[:, 1] fails on a second
# run because y is already a Series by then.
y = pd.get_dummies(messages["Label"])["spam"]

In [21]:
# Inspect the target vector.
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: spam, Length: 5572, dtype: uint8

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [23]:
# Hold out one third of the rows for evaluation; the fixed random_state keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y,
    test_size=1/3,
    random_state=2,
)

In [24]:
# Multinomial naive Bayes classifier with default hyper-parameters.
model = MultinomialNB()

In [25]:
# Train the classifier on the training portion of the count matrix.
model.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
# Predict labels for the held-out messages.
pred = model.predict(X_test)

In [27]:
# Fraction of held-out messages whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=pred)

0.9736275565123789

In [28]:
# Spot-check: the first five predictions...
pred[:5]

array([0, 0, 0, 0, 0], dtype=uint8)

In [29]:
# Spot-check: the first five true labels of the held-out set.
y_test[:5]

5086    0
2120    0
2318    0
2917    0
1352    0
Name: spam, dtype: uint8