In [28]:
import pandas as pd

In [29]:
df = pd.read_csv("https://raw.githubusercontent.com/sunnysavita10/Naive-Bayes/main/SpamClassifier-with-ML/sms_spam_data/SMSSpamCollection.csv", sep='\t', header=None, names=['label', 'messages'])

In [30]:
df.head()

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [31]:
import nltk
import re

In [32]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Himz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [33]:
from nltk.corpus import stopwords

In [34]:
from nltk.stem.porter import PorterStemmer

In [35]:
ps = PorterStemmer()

## Clean Data with RE

In [36]:
corpus = []

In [37]:
for i in range (len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['messages'][i])     #apart from a-zA-Z we replace that thing with empty string
    review = review.lower()                                  # Lowercase all words to keep ascii same
    review = review.split()                                  # Split them into a list

    # Stemming -> Getting a root word , eg. Like is root word in likes, liked, liking, likely etc.
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]             #stopwords.words('english') exclude irrelvant common words
    review = ' '.join(review)
    corpus.append(review)

'''
We followed 4 steps to get our stemming done!
1. Removing unnecesary letters like . , etc
2. lower case everything
3. split to get a list with no additional space
4. Stemming

Lastly moving into a single list
'''

'\nWe followed 4 steps to get our stemming done!\n1. Removing unnecesary letters like . , etc\n2. lower case everything\n3. split to get a list with no additional space\n4. Stemming\n\nLastly moving into a single list\n'

In [39]:
corpus[:5]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though']

### Now we will convert data into vector

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

In [41]:
cv = CountVectorizer()

In [42]:
X = cv.fit_transform(corpus).toarray()

In [44]:
X[:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

> #### We converted our data into vector.

> #### We can use BOW (Bag of Words) which takes a set of unique words, then we find out our vectors.

In [45]:
X.shape

(5572, 6296)

> #### Now there are words (features) which are occuring rarely in any other line. Kind of unique or totally rare, so we can remove those features.

In [46]:
cv = CountVectorizer(max_features=2500)

In [47]:
X= cv.fit_transform(corpus).toarray()

In [49]:
# We have shrink our data to max 2500 columns (words) with most occurence
X.shape

(5572, 2500)

### Now we will encode our target column

In [52]:
pd.get_dummies(df['label'])[:5]

# We don't meed ham columns so we drop it

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [54]:
y = pd.get_dummies(df['label'], drop_first=True)

#### Now we will split data and train model

In [55]:
from sklearn.model_selection import train_test_split

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

> ## GaussianNB

In [57]:
from sklearn.naive_bayes import GaussianNB

In [58]:
model = GaussianNB()

In [59]:
model.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [61]:
y_pred = model.predict(X_test)

In [62]:
from sklearn.metrics import accuracy_score

In [63]:
accuracy_score(y_test, y_pred)

0.8506819813352476

> ## MultinomialNB

In [64]:
from sklearn.naive_bayes import MultinomialNB

In [65]:
model2 = MultinomialNB()

In [66]:
model2.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [67]:
y_pred2 = model2.predict(X_test)

In [68]:
accuracy_score(y_test, y_pred2)

0.9834888729361091