### Importing the dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
messages = pd.read_csv('SMSSpamCollection', sep='\t', names=['label', 'message'])

In [3]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data Cleaning and Preprocessing

In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [5]:
ps = PorterStemmer()
lr = WordNetLemmatizer()

In [6]:
def preprocess(messages):
    
    corpus = []
    for i in range(len(messages)):
        review = re.sub('[^a-zA-Z]',' ', messages['message'][i])
        review = review.lower()
        review = review.split()
        review = [lr.lemmatize(word) for word in review if not word in stopwords.words('english')]
        review = ' '.join(review)
        corpus.append(review)
    return corpus

In [7]:
corpus = preprocess(messages)

In [8]:
corpus[:5]

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though']

### Creating TF-IDF model

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
tf = TfidfVectorizer(max_features=2500)

In [11]:
X = tf.fit_transform(corpus).toarray()
y = pd.get_dummies(messages['label'])
y.head()

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [12]:
y = y.iloc[:,1].values
y[:5]

array([0, 0, 1, 0, 0], dtype=uint8)

### Train Test Split

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

### Training model using Naive Bayes Classifier

In [15]:
from sklearn.naive_bayes import MultinomialNB

In [16]:
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [17]:
y_pred = spam_detect_model.predict(X_test)

### Calculating accuracy

In [18]:
from sklearn.metrics import confusion_matrix

In [19]:
confusion_m = confusion_matrix(y_test, y_pred)

In [20]:
confusion_m

array([[964,   2],
       [ 18, 131]], dtype=int64)

In [21]:
from sklearn.metrics import classification_report

In [22]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.98      0.88      0.93       149

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



### Testing the model

In [23]:
df = pd.DataFrame({"message":["Free tones Hope you enjoyed your new content",
                             "No. I meant the calculation is the same. That I'll call later",
                             "Had your contract mobile 11 Mnths? Latest Motorola Now",
                             "WINNER!! You just won a free ticket to Bahamas. Send your Details"]})

integers = tf.transform(preprocess(df))

In [24]:
df['label'] = spam_detect_model.predict(integers)

In [25]:
df['label'] = np.where(df['label']==1, 'spam', 'ham')
df

Unnamed: 0,message,label
0,Free tones Hope you enjoyed your new content,spam
1,No. I meant the calculation is the same. That ...,ham
2,Had your contract mobile 11 Mnths? Latest Moto...,spam
3,WINNER!! You just won a free ticket to Bahamas...,spam
