### Spam Detection With Machine Learning - Kavya Mukkamala

In [None]:
#import required libraries
import numpy as np
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
#load dataset
from google.colab import files
uploaded = files.upload()
df = pd.read_csv('Youtube05-Shakira.csv')

Saving Youtube05-Shakira.csv to Youtube05-Shakira (4).csv


In [None]:
#displays first 5 rows of the dataframe
#helps us gain an understanding of what our dataframe looks like, and how we may need to adjust it
df.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,z13lgffb5w3ddx1ul22qy1wxspy5cpkz504,dharma pal,2015-05-29T02:30:18.971000,Nice song﻿,0
1,z123dbgb0mqjfxbtz22ucjc5jvzcv3ykj,Tiza Arellano,2015-05-29T00:14:48.748000,I love song ﻿,0
2,z12quxxp2vutflkxv04cihggzt2azl34pms0k,Prìñçeśś Âliś Łøvê Dømíñø Mâđiś™ ﻿,2015-05-28T21:00:08.607000,I love song ﻿,0
3,z12icv3ysqvlwth2c23eddlykyqut5z1h,Eric Gonzalez,2015-05-28T20:47:12.193000,"860,000,000 lets make it first female to reach...",0
4,z133stly3kete3tly22petvwdpmghrlli,Analena López,2015-05-28T17:08:29.827000,shakira is best for worldcup﻿,0


In [None]:
#provides basic statistics about the data
df.describe()

Unnamed: 0,CLASS
count,370.0
mean,0.47027
std,0.499791
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [None]:
#There are 370 instances and 5 columns
df.shape

(370, 5)

In [None]:
#prints content and class of 10 random instances in the dataset
print(df.sample(10))

                                      COMMENT_ID  \
42           z13iuzxghsypjtfwy04ccnewbqbde3qwea0   
280  _2viQ_Qnc6_8ffRCQd5tgg-gWMV557Vkqa4Yz1P4m8s   
301  _2viQ_Qnc68kPR6lRkhBHXUX2dGt04-4RgzINpv8Yhk   
234  _2viQ_Qnc68MKhLnK71z12gu878i_A0sdfmpA0RvgOE   
304  _2viQ_Qnc6-m9RqGULP5B8P1SH2WqvKu3TnN7rBk2CI   
226  _2viQ_Qnc6_ahzf6NP9Anh6ef2byWNEcrFHd9swO25s   
367  _2viQ_Qnc6_k_n_Bse9zVhJP8tJReZpo8uM2uZfnzDs   
345  _2viQ_Qnc69mci30y5muwQXNMaeCmIvZ4ca8l_4zPmA   
51           z12bfraboyajftgbz04ccbkr3xjxfxyxsew   
194  _2viQ_Qnc6-grcnVFTtwnhvC9tpgVG33p5a0AZHKlLI   

                                 AUTHOR                        DATE  \
42                     Сергей Андреевич  2015-05-25T18:00:22.486000   
280                      Faisal Alqarny  2013-09-05T21:46:13.297000   
301  Ando Nesia - | MC | Music Producer  2013-08-25T04:22:28.183000   
234                            5000palo  2013-10-02T13:45:33.782000   
304                    henry Richardson  2013-08-24T05:53:21.519000   
2

In [None]:
#since the CommentID, Author, and Date columns aren't relevant to the program, the dataframe can be revised to not include them
df = df[["CONTENT", "CLASS"]]
print(df.sample(10))

                                               CONTENT  CLASS
175  Hey, have you tried &quot;DribblePROshot&quot;...      1
293  How did you know that people makes another acc...      1
289  do you want to make some easy money? check out...      1
205  Hello Guys...I Found a Way to Make Money Onlin...      1
154              Pleas subscribe my channel GamezZMTA﻿      1
11                  Why so many disliked??????!!!!!!😯﻿      0
163  i am from Brazil please subscribe my channel l...      1
220                                          like!!!!!      0
114                                              Like﻿      0
358  Hey Music Fans I really appreciate all of you ...      1


In [None]:
#the information along with the data set indicates that class 0 means not spam, while class 1 means spam
#to make the data easier to understand, class 0 will be changed to read "Not Spam" and class 1 will be changed to read "Spam"
df["CLASS"] = df["CLASS"].map({0: "Not Spam", 1: "Spam"})
#prints content and class of 10 random instances in the dataset
print(df.sample(10))

                                               CONTENT     CLASS
311                                      adf.ly /KlD3Y      Spam
349                                  Love this song!!!  Not Spam
39                                          fave song﻿  Not Spam
48                                              goood﻿  Not Spam
316                      subscribe to my pagee please.      Spam
16            Whose watching this in 2015. If so hi-5﻿  Not Spam
119                                          love!!!!﻿  Not Spam
281               god she is so sexy! drives me crazy!  Not Spam
167  New way to make money easily and spending 20 m...      Spam
346   hey you ! check out the channel of Alvar Lake !!      Spam


In [None]:
#creates dataframes a and b for the content and class lists, respectively
a = df["CONTENT"].values
b = df["CLASS"].values

This dataset is in a binary distribution-- a comment is either classified as "Spam" or "Not Spam". Therefore, the best classifier to use in the case would be Bernoulli Naive Bayes.

In [None]:
#Bernoulli Naive Bayes is used for binary classification
#The content values are not binary, so it will be altered with CountVectorizer()
#CountVectorizer() transforms text to a vector
vectorizer = CountVectorizer()
a = (vectorizer.fit_transform(a)).toarray()
print(a)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
#splits the data into 80% training and 20% testing
trainingA = a[:296]
trainingB = b[:296]

testingA = a[296:]
testingB = b[296:]

In [None]:
#sets threshold for binarizing
BNB = BernoulliNB(binarize = 0)
#fits Bernoulli Naive Bayes classifier according to a (content) and b (class)
BNB.fit(trainingA, trainingB)

trainingB_expect = testingB
trainingB_prediction = BNB.predict(testingA)
#prints accuracy score
print (accuracy_score(trainingB_expect, trainingB_prediction))

0.6216216216216216


In [None]:
#from training data
b_pred_train= BNB.predict(trainingA)
#from testing data
b_pred_test = BNB.predict(testingA)

#prints classification report
print(classification_report(trainingB, b_pred_train))

              precision    recall  f1-score   support

    Not Spam       0.75      1.00      0.86       182
        Spam       1.00      0.46      0.63       114

    accuracy                           0.79       296
   macro avg       0.87      0.73      0.75       296
weighted avg       0.85      0.79      0.77       296



In [None]:
#prints classification report
print(classification_report(testingB, b_pred_test))

              precision    recall  f1-score   support

    Not Spam       0.33      1.00      0.50        14
        Spam       1.00      0.53      0.70        60

    accuracy                           0.62        74
   macro avg       0.67      0.77      0.60        74
weighted avg       0.87      0.62      0.66        74

