Building a spam classifier using Naive Bayes

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import nltk
import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
#Importing the CountVectorizer to convert raw text to numbers
from sklearn.feature_extraction.text import CountVectorizer
#Importing the confusion matrix methods to check the performance of the model and visualise it.
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score 
import seaborn as sns

# Fetch the NLTK data this notebook relies on:
#  - 'stopwords': the corpus read later through stopwords.words('english')
#  - 'punkt': presumably the tokenizer models behind word_tokenize
#    (NOTE(review): confirm the resource name for the installed NLTK version)
nltk.download('stopwords')
nltk.download('punkt')

Loading data

In [None]:
def loadData(path,sep,skiprows):
    """Read a delimited two-column file into a DataFrame with a fixed header.

    Parameters
    ----------
    path : str or file-like
        Location of the file; handed straight to ``pandas.read_csv``.
    sep : str
        Field delimiter (e.g. ``","`` or a tab).
    skiprows : int
        Number of leading rows to skip (1 drops an existing header row,
        0 keeps every row).

    Returns
    -------
    pandas.DataFrame
        The file contents with the columns named ``type`` and ``text``.
    """
    # `sep` has to be given by keyword: recent pandas releases accept only the
    # path positionally and reject a positional separator with a TypeError.
    return pd.read_csv(path, sep=sep, names=['type', 'text'], skiprows=skiprows)
#Normalizing all datasets with the same header
# The two CSV files are read with one leading row skipped; the tab-separated
# text file is read from its first row.
data1 = loadData("TrainDataset1.csv", ",", 1)
data2 = loadData("TrainDataset2.csv", ",", 1)
data3 = loadData("TrainDataset3.txt", "\t", 0)

# Show each raw frame under its own banner.
for datasetBanner, datasetFrame in (
    ("\t-------------TrainDataset1-------------\n", data1),
    ("\t-------------TrainDataset2-------------\n", data2),
    ("\t-------------TrainDataset3-------------\n", data3),
):
    print(datasetBanner)
    print(datasetFrame)

Preprocessing the data

In [None]:
def preProcessing(dataTraining,data):
    """Clean the messages of ``data`` and append them to ``dataTraining``.

    The text is taken from the last column of ``data`` and goes through:
    lower-casing, replacement of every character that is neither a word
    character nor whitespace by a space, tokenisation with ``word_tokenize``
    and removal of English stop words. The remaining tokens are re-joined
    with single spaces.

    Parameters
    ----------
    dataTraining : pandas.DataFrame
        Rows accumulated so far. It is not modified; a new frame is returned.
    data : pandas.DataFrame
        Frame with at least two columns. Column 0 is copied unchanged and the
        cleaned text is stored under the name of column 1.

    Returns
    -------
    pandas.DataFrame
        ``dataTraining`` followed by the cleaned rows, re-indexed from 0.
    """
    label_column, text_column = data.columns[0], data.columns[1]

    # Empty cells come back from pandas as float NaN (and purely numeric
    # messages as numbers); str.lower would raise on those, so every value is
    # normalised to a string first, with missing ones becoming "".
    raw_texts = data[data.columns[-1]].fillna("").astype(str)

    # Built once instead of once per message.
    stop_words = set(stopwords.words('english'))
    punctuation = re.compile(r'[^\w\s]')

    texts = []
    for message in raw_texts:
        #make all words lower case and remove punctuation
        message = punctuation.sub(' ', message.lower())
        #remove stopwords
        kept_words = [w for w in word_tokenize(message) if w not in stop_words]
        texts.append(" ".join(kept_words))

    #create a new frame with the clean messages (no punctuation, no stopwords)
    cleanData = pd.DataFrame({
        label_column: list(data[label_column]),
        text_column: texts,
    })

    #Add the new data to the bigData for the training
    return pd.concat([dataTraining, cleanData], ignore_index=True)

#creating and initializing the bigData for the training
columns = ['type', 'text']
dataTraining = pd.DataFrame(columns=columns).fillna(0)

#Combine the 3 dataset into one big dataset for the training.
# Each pass cleans one raw frame and appends it to the accumulated frame.
for rawDataset in (data1, data2, data3):
    dataTraining = preProcessing(dataTraining, rawDataset)

# Left as the final expression so the notebook renders the combined frame.
dataTraining

Visualising the data

In [None]:
#Visualise the data

#function that counts how often each word occurs in the ham messages of our bigData
def count_words_ham(texts,types):
    """Count word occurrences over the messages labelled ``"ham"``.

    Parameters
    ----------
    texts : sequence of str
        The messages.
    types : sequence of str
        The label of the message at the same position in ``texts``.

    Returns
    -------
    dict
        Maps each word to the number of times it appears in ham messages,
        in order of first appearance.
    """
    counts = {}
    for text, label in zip(texts, types):
        if label != "ham":
            continue
        # split() without an argument drops empty tokens, so an empty message
        # or a run of spaces no longer registers "" as a word.
        for word in text.split():
            counts[word] = counts.get(word, 0) + 1
    return counts


#function that counts how often each word occurs in the spam messages of our bigData
def count_words_spam(texts,types):
    """Count word occurrences over the messages labelled ``"spam"``.

    Parameters
    ----------
    texts : sequence of str
        The messages.
    types : sequence of str
        The label of the message at the same position in ``texts``.

    Returns
    -------
    dict
        Maps each word to the number of times it appears in spam messages,
        in order of first appearance.
    """
    counts = {}
    for text, label in zip(texts, types):
        if label != "spam":
            continue
        # split() without an argument drops empty tokens, so an empty message
        # or a run of spaces no longer registers "" as a word.
        for word in text.split():
            counts[word] = counts.get(word, 0) + 1
    return counts

# Pull the message texts (last column) and labels (first column) out once
# and hand the same lists to both counters.
messageTexts = dataTraining[dataTraining.columns[-1]].tolist()
messageTypes = dataTraining[dataTraining.columns[0]].tolist()
data_ham = count_words_ham(messageTexts, messageTypes)
data_spam = count_words_spam(messageTexts, messageTypes)


#sort the dictionary: each becomes a list of (word, count) pairs, highest count first
data_ham = list(data_ham.items())
data_ham.sort(key=lambda pair: pair[1], reverse=True)
data_spam = list(data_spam.items())
data_spam.sort(key=lambda pair: pair[1], reverse=True)


#we want to visualize the 8 most repeated words in spam and ham data
dataToVisualize_ham, dataToVisualize_spam = {}, {}

#copy the leading (key, value) pairs of a sequence into a dictionary
def copy(numOfElement,data,dataToVisualize):
    """Store the first ``numOfElement`` pairs of ``data`` in ``dataToVisualize``.

    ``data`` is iterated in order; element 0 of each item becomes the key and
    element 1 the value. ``dataToVisualize`` is updated in place and also
    returned.
    """
    for position, pair in enumerate(data):
        # Stop as soon as the requested number of pairs has been copied.
        if position >= numOfElement:
            break
        dataToVisualize[pair[0]] = pair[1]

    return dataToVisualize

# Keep only the 8 leading (word, count) pairs of each sorted list.
dataToVisualize_ham = copy(8, data_ham, dataToVisualize_ham)
dataToVisualize_spam = copy(8, data_spam, dataToVisualize_spam)


#exploring the dataset
print("----exploring the dataset----\n")
print(dataTraining['type'].value_counts())

#plotting
width = 0.30  # the width of the bars
# One bar chart per class; the title is drawn in the same colour as the bars.
for wordCounts, barColor, chartTitle in (
    (dataToVisualize_ham, "r", "ham"),
    (dataToVisualize_spam, "g", "spam"),
):
    plt.bar(wordCounts.keys(), wordCounts.values(), width, color=barColor)
    plt.title(chartTitle, color=barColor)
    plt.show()

# Histogram of the label column, left as the final expression of the cell.
dataTraining[dataTraining.columns[0]].hist(bins=2)

Building, training and validating the classifier

In [None]:
# Build, train and validate the classifier,

#initialize the vectorizer and the model
vectorizer = CountVectorizer()
clf = MultinomialNB()

# Inputs are the cleaned messages (last column); targets are the labels (first column).
labelColumn, textColumn = dataTraining.columns[0], dataTraining.columns[-1]
X = dataTraining[textColumn].tolist()
Y = dataTraining[labelColumn].tolist()

#split the data: 20% is held out, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

#helper: the three evaluation artefacts shared by both steps of doAll
def _evaluatePredictions(y_true, y_pred):
    """Return ``(classification report, confusion matrix, accuracy)``."""
    cm = confusion_matrix(y_true, y_pred)
    cr = classification_report(y_true, y_pred)
    ac = accuracy_score(y_true, y_pred)
    return cr, cm, ac


#function to do the training,the prediction and evaluate the model
def doAll(clf,vectorizer,step,x_train,x_test,y_train,y_test):
    """Train (optionally), predict and evaluate.

    Parameters
    ----------
    clf : classifier exposing ``fit`` and ``predict``
    vectorizer : object exposing ``fit_transform`` and ``transform``
    step : str
        ``"step4"`` fits ``vectorizer`` and ``clf`` on the training data and
        then evaluates on the test data. ``"step5"`` only evaluates, so both
        objects must already have been fitted.
    x_train, y_train
        Training messages and labels; only read for ``"step4"``.
    x_test, y_test
        Messages to classify and their true labels.

    Returns
    -------
    tuple
        ``(classification_report, confusion_matrix, accuracy_score)``.

    Raises
    ------
    ValueError
        If ``step`` is neither ``"step4"`` nor ``"step5"``.
    """
    if step == "step4":
        # The vectorizer output is passed on as is; the classifier is given
        # the sparse matrix directly instead of a dense copy of it.
        clf.fit(vectorizer.fit_transform(x_train), y_train)
    elif step != "step5":
        raise ValueError("step must be 'step4' or 'step5', got %r" % (step,))

    #Predicting the test set results
    y_pred = clf.predict(vectorizer.transform(x_test))

    # Always score against the y_test argument. The evaluation-only branch
    # used to read a notebook-level variable, which tied the function to a
    # later cell having been executed first.
    return _evaluatePredictions(y_test, y_pred)

# Fit on the training split and score on the held-out split.
cr, cm, ac = doAll(
    clf,
    vectorizer,
    "step4",
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

""" Visualisation """
def visualFunction(ac,cr,cm,color):
    """Print the evaluation results and draw the confusion matrix as a heatmap.

    Parameters
    ----------
    ac : float
        Accuracy as a fraction; printed as a percentage with one decimal.
    cr : str
        Classification report, printed unchanged.
    cm : array-like
        Confusion matrix of counts.
    color : str
        Colormap name forwarded to ``sns.heatmap``.

    Returns
    -------
    None
    """
    print ("\t\t----------Accuracy Score ----------\nAccuracy=%.1f"% (ac*100),"%")

    print("\n\t\t----------Report----------\n")
    print(cr)
    #Visualising the Confusion Matrix
    print("\n\t\t------Confusion Matrix------\n")
    print(cm)

    #heatmap for confusion matrix
    print("\n\t\t------heatmap for confusion matrix------\n")

    cm = np.asarray(cm)
    flat_counts = cm.flatten()
    # Guard the percentage computation against an all-zero matrix.
    total = flat_counts.sum() or 1
    group_counts = ["{0:0.0f}".format(value) for value in flat_counts]
    group_percentages = ["{0:.2%}".format(value) for value in flat_counts/total]

    if cm.shape == (2, 2):
        # NOTE(review): these names assume row/column 0 is the negative
        # class (ham) and row/column 1 the positive one (spam) -- confirm
        # against the label order of the matrix passed in.
        group_names = ['True Neg','False Pos','False Neg','True Pos']
        labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
    else:
        # The four cell names only make sense for a binary matrix; for any
        # other size annotate with the count and percentage alone.
        labels = [f"{v2}\n{v3}" for v2, v3 in zip(group_counts,group_percentages)]

    # The annotation grid must have the same shape as the matrix itself.
    labels = np.asarray(labels).reshape(cm.shape)
    sns.heatmap(cm, annot=labels, fmt='', cmap=color)
    return

# Report the held-out-split metrics with a red colormap.
visualFunction(
    ac=ac,
    cr=cr,
    cm=cm,
    color='Reds',
)

Supervised classification

In [None]:
# Test the classifier

#load the data (tab-separated, read from the first row)
data4 = loadData("SMSSpamCollection.txt", "\t", 0)
columns = ['type', 'text']
SMSData = pd.DataFrame(data4.values.tolist(), columns=columns)

# Labels sit in the first column, messages in the last one.
smsLabelColumn, smsTextColumn = SMSData.columns[0], SMSData.columns[-1]
y_test_SMS = SMSData[smsLabelColumn].tolist()
x_test_SMS = SMSData[smsTextColumn].tolist()

# Evaluation only: no training data is passed for "step5".
smsMetrics = doAll(clf, vectorizer, "step5", None, x_test_SMS, None, y_test_SMS)
crSMS, cmSMS, acSMS = smsMetrics

#visualization
visualFunction(acSMS, crSMS, cmSMS, 'terrain')

Unsupervised classification

In [None]:
# Test the classifier
def unsupPred(x_dataTest):
    """Predict a label for every message in ``x_dataTest``.

    Uses the notebook-level ``vectorizer`` and ``clf``; both are only
    transformed/predicted with here, never fitted.

    Returns a DataFrame with the columns ``type`` (the predicted label) and
    ``text`` (the message as passed in).
    """
    #vectorize the data, then predict on the dense array
    predictedLabels = clf.predict(vectorizer.transform(x_dataTest).toarray())

    # Start from an empty two-column frame and fill in text, then labels.
    labelledFrame = pd.DataFrame(columns=['type', 'text'])
    labelledFrame['text'] = x_dataTest
    labelledFrame['type'] = predictedLabels

    return labelledFrame
#load data
# The file is read as a single column, skipping its first row; the column is
# named 'type' here although it is used as the message text below.
data5 = pd.read_csv("TestDataset.csv", names=['type'], skiprows=1)
x_unspData = data5[data5.columns[0]]

unsupPredData = unsupPred(x_unspData)
print("\n\t------TestDataset with predicted labels------\n", unsupPredData, sep="\n")

#exploring the dataset
print(
    "\n\n\t----exploring the predTestDataset----\n",
    unsupPredData['type'].value_counts(),
    sep="\n",
)

# Histogram of the predicted labels, left as the final expression of the cell.
unsupPredData[unsupPredData.columns[0]].hist(bins=2)

Cheating the classifier

In [None]:
# Cheat the classifier
# Append five hand-written sentences to the test messages and look at how the
# already-fitted classifier labels them.
data6=pd.read_csv("TestDataset.csv",names=['type'],skiprows=1)
newRows=[["Since our debut a year ago more than a million people have joined our community"],
         ["Oh! no share Market has fallen down by $100,000 due to Corona outbreak... "],
         ["ok, got ur call"],
         ["get a free call"],
         ["hi, u get text 4 free"]]

# DataFrame.append no longer exists in current pandas; pd.concat builds the
# same frame (original rows followed by the new ones, re-indexed from 0).
data6 = pd.concat([data6, pd.DataFrame(newRows, columns=['type'])], ignore_index=True)

newRows=data6[data6.columns[0]]
newRows = vectorizer.transform(newRows)
y_pred_new = clf.predict(newRows.toarray())
print("\n\t----Predection of the new 5 sentences----\n")
# The hand-written sentences are the last five rows.
print(y_pred_new[-5:])

#exploring the dataset
columns=['type']
cheatPredData = pd.DataFrame(y_pred_new,columns=columns)
print("\n\t----exploring the cheatPredDataset----\n")
print(cheatPredData['type'].value_counts())
cheatPredData[cheatPredData.columns[0]].hist(bins=2)