In [1]:
# import the relevant libraries
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# loading the data
df = pd.read_csv(r'F:\Full Stack Data Science and AI Naresh-IT\NLP\SMSSpamCollection', sep = '\t', names = ['label', 'message'])
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# data understanding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


### Text cleaning: remove punctuations, remove stopwords, stemming

In [4]:
import nltk
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
ps = PorterStemmer()

corpus = []

for i in range(len(df)):
    s = re.sub('[^a-zA-Z]', ' ', df['message'][i])
    s = s.lower()
    s = s.split()
    s = [ps.stem(word) for word in s if word not in stopwords.words('english')]
    s = ' '.join(s)
    corpus.append(s)


### Vectorization

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

In [6]:
y = pd.get_dummies(df['label'], drop_first = True)

#### Train-test-split

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 0)

#### Modelling: Naive Bayes

In [8]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train, y_train)

In [9]:
# evaluations
ypred_train = nb.predict(X_train)
ypred_test = nb.predict(X_test)

from sklearn.metrics import accuracy_score
print('train: ', accuracy_score(y_train, ypred_train))
print('test: ', accuracy_score(y_test, ypred_test))

from sklearn.model_selection import cross_val_score
print('cv score: ', cross_val_score(nb, X_train, y_train, cv = 5, scoring = 'accuracy').mean())

train:  0.9921471842046219
test:  0.979372197309417
cv score:  0.9777885984911396
