In [1]:
"""
@author: Mehul
"""

# importing the relevant libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [2]:
# importing the relevant libraries
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report

# Creating class for spam classification with user-built functions for each stage
class spam_classifier:
    def __init__(self,data):
        self.data = data
    
    #Cleaning the data with removing extra spaces and other special characters
    def data_preprocessing(self):
        ps = PorterStemmer()
        lem = WordNetLemmatizer()
        corpus1 = []
        corpus2 = []
        for i in range(len(self.data)):
            self.data.columns = ['label','message']
            data_clean = re.sub('[^a-zA-Z]', ' ', self.data['message'][i])
            data_clean = data_clean.lower()
            data_clean = data_clean.split()
            data_stem = [ps.stem(word) for word in data_clean if not word in stopwords.words('english')]
            data_stem = ' '.join(data_stem)
            data_lem = [lem.lemmatize(word) for word in data_clean if not word in stopwords.words('english')]
            data_lem = ' '.join(data_lem)
            corpus1.append(data_stem)
            corpus2.append(data_lem)
        return corpus1,corpus2
    
    # Using bag of words (NLP) for conversion of categorical or textual data into a numerical matrix
    def bag_of_words_model(self):
        corpus1 = spam_classifier.data_preprocessing(self)[0]
        corpus2 = spam_classifier.data_preprocessing(self)[1]
        cv = CountVectorizer(max_features=2500)
        X1 = cv.fit_transform(corpus1).toarray()
        X2 = cv.fit_transform(corpus2).toarray()
        Y = pd.get_dummies(self.data['label'])
        Y=Y.iloc[:,1].values
        return X1,X2,Y
    
    # Splitting the above obtained numerical matrix into training and test set for input and labels
    def train_test_set_split(self):
        X1 = spam_classifier.bag_of_words_model(self)[0]
        X2 = spam_classifier.bag_of_words_model(self)[1]
        Y = spam_classifier.bag_of_words_model(self)[2]
        X1_train, X1_test, Y_train, Y_test = train_test_split(X1, Y, test_size = 0.20, random_state = 0)
        X2_train, X2_test, Y_train, Y_test = train_test_split(X2, Y, test_size = 0.20, random_state = 0)
        return X1_train, X2_train, X1_test, X2_test, Y_train, Y_test 
    
    # Training the model with decision tree, multinomial naive bayes and random forest classifiers
    def model_training(self): 
        X1_train, X2_train, X1_test, X2_test, Y_train = spam_classifier.train_test_set_split(self)[0:-1] 
        DT = DecisionTreeClassifier()
        MNB = MultinomialNB()
        RFC = RandomForestClassifier()
        classifiers = [DT,MNB,RFC]
        Classifiers = ['Decision tree classifier', 'Multinomial Naive Bayes classifier', 'Random Forest classifier']
        predictions_stem = []
        predictions_lem = []
        for classifier in classifiers:
            spamming_detection_model_stem = classifier.fit(X1_train, Y_train)
            spamming_detection_model_lem = classifier.fit(X2_train, Y_train)
            Y1_pred=spamming_detection_model_stem.predict(X1_test)
            Y2_pred=spamming_detection_model_lem.predict(X2_test)
            predictions_stem.append((Y1_pred,Classifiers[classifiers.index(classifier)]))
            predictions_lem.append((Y2_pred,Classifiers[classifiers.index(classifier)]))
        return predictions_stem, predictions_lem
    
    # Comparing the above trained models with classification report and confusion matrix metrics
    def accuracy_metrics_check(self):
        predictions_stem = spam_classifier.model_training(self)[0]
        predictions_lem = spam_classifier.model_training(self)[1]
        Y_test = spam_classifier.train_test_set_split(self)[5]
        for i in range(len(predictions_stem)):
            print(predictions_stem[i][1],' - Stemming','\n')
            print(classification_report(predictions_stem[i][0],Y_test),'\n')
            print('Confusion matrix', '\n',confusion_matrix(predictions_stem[i][0],Y_test),'\n')
        for i in range(len(predictions_lem)):
            print(predictions_lem[i][1],' - Lemmitization','\n')
            print(classification_report(predictions_lem[i][0],Y_test),'\n')
            print('Confusion matrix', '\n',confusion_matrix(predictions_lem[i][0],Y_test),'\n')

if __name__=='__main__':
    dataset = pd.read_csv('Downloads/archive (1)/spam.csv')
    dataset.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1,inplace=True)
    obj = spam_classifier(dataset)
    obj.accuracy_metrics_check()

Decision tree classifier  - Stemming 

              precision    recall  f1-score   support

           0       0.95      0.87      0.91      1035
           1       0.17      0.36      0.24        80

    accuracy                           0.83      1115
   macro avg       0.56      0.62      0.57      1115
weighted avg       0.89      0.83      0.86      1115
 

Confusion matrix 
 [[898 137]
 [ 51  29]] 

Multinomial Naive Bayes classifier  - Stemming 

              precision    recall  f1-score   support

           0       0.77      0.90      0.83       813
           1       0.52      0.28      0.37       302

    accuracy                           0.73      1115
   macro avg       0.65      0.59      0.60      1115
weighted avg       0.70      0.73      0.71      1115
 

Confusion matrix 
 [[733  80]
 [216  86]] 

Random Forest classifier  - Stemming 

              precision    recall  f1-score   support

           0       0.97      0.88      0.92      1045
           1      