<h1>Build a Spam Detection Filter</h1>

In [1]:
# Import the findspark library to locate Spark installation
import findspark

# Initialize Spark with the specified Spark installation path
findspark.init('/home/mina/python-spark/spark-3.4.0-bin-hadoop3/')


# Import the pyspark library
import pyspark

# Import the SparkSession class from pyspark.sql
from pyspark.sql import SparkSession

# Create a Spark session with the name 'NLP_Project'
spark = SparkSession.builder.appName('NLP_Project').getOrCreate()

# Read a CSV file ('smsspamcollection/SMSSpamCollection') into a DataFrame,
# inferring the schema from the data, and specifying the tab ('\t') as the separator
data = spark.read.csv('smsspamcollection/SMSSpamCollection', inferSchema = True , sep ='\t')

# Display the contents of the DataFrame
data.show()

23/09/18 10:32:18 WARN Utils: Your hostname, mina-VirtualBox resolves to a loopback address: 127.0.1.1; using 192.168.1.143 instead (on interface enp0s3)
23/09/18 10:32:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/18 10:32:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
|spam|FreeMsg Hey there...|
| ham|Even my brother i...|
| ham|As per your reque...|
|spam|WINNER!! As a val...|
|spam|Had your mobile 1...|
| ham|I'm gonna be home...|
|spam|SIX chances to wi...|
|spam|URGENT! You have ...|
| ham|I've been searchi...|
| ham|I HAVE A DATE ON ...|
|spam|XXXMobileMovieClu...|
| ham|Oh k...i'm watchi...|
| ham|Eh u remember how...|
| ham|Fine if thats th...|
|spam|England v Macedon...|
+----+--------------------+
only showing top 20 rows



In [2]:
# Rename the '_c0' column to 'class' and '_c1' column to 'text' in the DataFrame 'data'
data_main = data.withColumnRenamed('_c0','class').withColumnRenamed('_c1','text')

# Display the contents of the DataFrame 'data_main'
data_main.show()

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
| spam|FreeMsg Hey there...|
|  ham|Even my brother i...|
|  ham|As per your reque...|
| spam|WINNER!! As a val...|
| spam|Had your mobile 1...|
|  ham|I'm gonna be home...|
| spam|SIX chances to wi...|
| spam|URGENT! You have ...|
|  ham|I've been searchi...|
|  ham|I HAVE A DATE ON ...|
| spam|XXXMobileMovieClu...|
|  ham|Oh k...i'm watchi...|
|  ham|Eh u remember how...|
|  ham|Fine if thats th...|
| spam|England v Macedon...|
+-----+--------------------+
only showing top 20 rows



In [3]:
# Import the length function from pyspark.sql.functions
from pyspark.sql.functions import length

# Add a new column 'length' to the DataFrame 'data_main' containing the length of the 'text' column
data_main = data_main.withColumn('lenght', length(data_main['text']))

# Display the contents of the updated DataFrame 'data_main'
data_main.show()

+-----+--------------------+------+
|class|                text|lenght|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
| spam|FreeMsg Hey there...|   147|
|  ham|Even my brother i...|    77|
|  ham|As per your reque...|   160|
| spam|WINNER!! As a val...|   157|
| spam|Had your mobile 1...|   154|
|  ham|I'm gonna be home...|   109|
| spam|SIX chances to wi...|   136|
| spam|URGENT! You have ...|   155|
|  ham|I've been searchi...|   196|
|  ham|I HAVE A DATE ON ...|    35|
| spam|XXXMobileMovieClu...|   149|
|  ham|Oh k...i'm watchi...|    26|
|  ham|Eh u remember how...|    81|
|  ham|Fine if thats th...|    56|
| spam|England v Macedon...|   155|
+-----+--------------------+------+
only showing top 20 rows



In [4]:
# Group the DataFrame 'data_main' by the 'class' column and calculate the mean for each group
# Then, display the resulting DataFrame
data_main.groupBy('class').mean().show()

+-----+-----------------+
|class|      avg(lenght)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



In [5]:
# Import necessary classes from pyspark.ml.feature
from pyspark.ml.feature import (Tokenizer,StopWordsRemover,CountVectorizer, IDF,
                                StringIndexer)

# Create a Tokenizer object to tokenize the 'text' column and output the result to the 'words' column
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Transform the DataFrame 'data_main' using the Tokenizer
data_tokeniz = tokenizer.transform(data_main)

# Display the DataFrame with the 'words' column containing tokenized text
data_tokeniz.show()

+-----+--------------------+------+--------------------+
|class|                text|lenght|               words|
+-----+--------------------+------+--------------------+
|  ham|Go until jurong p...|   111|[go, until, juron...|
|  ham|Ok lar... Joking ...|    29|[ok, lar..., joki...|
| spam|Free entry in 2 a...|   155|[free, entry, in,...|
|  ham|U dun say so earl...|    49|[u, dun, say, so,...|
|  ham|Nah I don't think...|    61|[nah, i, don't, t...|
| spam|FreeMsg Hey there...|   147|[freemsg, hey, th...|
|  ham|Even my brother i...|    77|[even, my, brothe...|
|  ham|As per your reque...|   160|[as, per, your, r...|
| spam|WINNER!! As a val...|   157|[winner!!, as, a,...|
| spam|Had your mobile 1...|   154|[had, your, mobil...|
|  ham|I'm gonna be home...|   109|[i'm, gonna, be, ...|
| spam|SIX chances to wi...|   136|[six, chances, to...|
| spam|URGENT! You have ...|   155|[urgent!, you, ha...|
|  ham|I've been searchi...|   196|[i've, been, sear...|
|  ham|I HAVE A DATE ON ...|   

In [6]:
# Import necessary functions and types from the PySpark SQL library
from pyspark.sql.functions import col,udf
from pyspark.sql.types import IntegerType

# Create a User Defined Function (UDF) to count the number of words in a column
count_token = udf(lambda words:len(words), IntegerType())

# Add a new column 'length_words' to the DataFrame 'data_tokeniz' using the UDF
data_tokeniz.withColumn('lenght_words',count_token(col('words'))).show()

+-----+--------------------+------+--------------------+------------+
|class|                text|lenght|               words|lenght_words|
+-----+--------------------+------+--------------------+------------+
|  ham|Go until jurong p...|   111|[go, until, juron...|          20|
|  ham|Ok lar... Joking ...|    29|[ok, lar..., joki...|           6|
| spam|Free entry in 2 a...|   155|[free, entry, in,...|          28|
|  ham|U dun say so earl...|    49|[u, dun, say, so,...|          11|
|  ham|Nah I don't think...|    61|[nah, i, don't, t...|          13|
| spam|FreeMsg Hey there...|   147|[freemsg, hey, th...|          32|
|  ham|Even my brother i...|    77|[even, my, brothe...|          16|
|  ham|As per your reque...|   160|[as, per, your, r...|          26|
| spam|WINNER!! As a val...|   157|[winner!!, as, a,...|          26|
| spam|Had your mobile 1...|   154|[had, your, mobil...|          29|
|  ham|I'm gonna be home...|   109|[i'm, gonna, be, ...|          21|
| spam|SIX chances t

[Stage 9:>                                                          (0 + 1) / 1]                                                                                

In [7]:
# Create a StopWordsRemover object, specifying the input and output columns
remover = StopWordsRemover(inputCol='words' , outputCol='StopWords')

# Use the StopWordsRemover to transform the 'data_tokeniz' DataFrame and remove stop words
remover_StopWord = remover.transform(data_tokeniz)

# Show the resulting DataFrame after removing stop words
remover_StopWord.show()

+-----+--------------------+------+--------------------+--------------------+
|class|                text|lenght|               words|           StopWords|
+-----+--------------------+------+--------------------+--------------------+
|  ham|Go until jurong p...|   111|[go, until, juron...|[go, jurong, poin...|
|  ham|Ok lar... Joking ...|    29|[ok, lar..., joki...|[ok, lar..., joki...|
| spam|Free entry in 2 a...|   155|[free, entry, in,...|[free, entry, 2, ...|
|  ham|U dun say so earl...|    49|[u, dun, say, so,...|[u, dun, say, ear...|
|  ham|Nah I don't think...|    61|[nah, i, don't, t...|[nah, think, goes...|
| spam|FreeMsg Hey there...|   147|[freemsg, hey, th...|[freemsg, hey, da...|
|  ham|Even my brother i...|    77|[even, my, brothe...|[even, brother, l...|
|  ham|As per your reque...|   160|[as, per, your, r...|[per, request, 'm...|
| spam|WINNER!! As a val...|   157|[winner!!, as, a,...|[winner!!, valued...|
| spam|Had your mobile 1...|   154|[had, your, mobil...|[mobile,

In [8]:
# Create a User Defined Function (UDF) to count the number of words in the 'StopWords' column
count_token1 = udf(lambda StopWords:len(StopWords), IntegerType())

# Add a new column 'lenght_StopWords' to the 'remover_StopWord' DataFrame using the UDF
remover_StopWord.withColumn('lenght_StopWords',count_token1(col('StopWords'))).show()

+-----+--------------------+------+--------------------+--------------------+----------------+
|class|                text|lenght|               words|           StopWords|lenght_StopWords|
+-----+--------------------+------+--------------------+--------------------+----------------+
|  ham|Go until jurong p...|   111|[go, until, juron...|[go, jurong, poin...|              16|
|  ham|Ok lar... Joking ...|    29|[ok, lar..., joki...|[ok, lar..., joki...|               6|
| spam|Free entry in 2 a...|   155|[free, entry, in,...|[free, entry, 2, ...|              23|
|  ham|U dun say so earl...|    49|[u, dun, say, so,...|[u, dun, say, ear...|               9|
|  ham|Nah I don't think...|    61|[nah, i, don't, t...|[nah, think, goes...|               7|
| spam|FreeMsg Hey there...|   147|[freemsg, hey, th...|[freemsg, hey, da...|              18|
|  ham|Even my brother i...|    77|[even, my, brothe...|[even, brother, l...|               9|
|  ham|As per your reque...|   160|[as, per, your,

In [9]:
# Create a CountVectorizer object, specifying the input and output columns
count_vc = CountVectorizer(inputCol='StopWords', outputCol='c_vec')

# Fit the CountVectorizer model on the 'remover_StopWord' DataFrame
model = count_vc.fit(remover_StopWord)

# Transform the 'remover_StopWord' DataFrame using the fitted CountVectorizer model
data_ConVec = model.transform(remover_StopWord)

# Show the resulting DataFrame with the CountVectorizer output
data_ConVec.show()

+-----+--------------------+------+--------------------+--------------------+--------------------+
|class|                text|lenght|               words|           StopWords|               c_vec|
+-----+--------------------+------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|   111|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|
|  ham|Ok lar... Joking ...|    29|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,301,...|
| spam|Free entry in 2 a...|   155|[free, entry, in,...|[free, entry, 2, ...|(13423,[2,13,19,3...|
|  ham|U dun say so earl...|    49|[u, dun, say, so,...|[u, dun, say, ear...|(13423,[0,70,80,1...|
|  ham|Nah I don't think...|    61|[nah, i, don't, t...|[nah, think, goes...|(13423,[36,134,31...|
| spam|FreeMsg Hey there...|   147|[freemsg, hey, th...|[freemsg, hey, da...|(13423,[10,60,140...|
|  ham|Even my brother i...|    77|[even, my, brothe...|[even, brother, l...|(13423,[10,53,102...|
|  ham|As 

In [10]:
# Create an IDF (Inverse Document Frequency) object, specifying the input and output columns
idf = IDF(inputCol='c_vec', outputCol='idf')

# Fit the IDF model on the 'data_ConVec' DataFrame
idf_model = idf.fit(data_ConVec)

# Transform the 'data_ConVec' DataFrame using the fitted IDF model
data_idf = idf_model.transform(data_ConVec)

# Select and display the 'idf' column from the resulting DataFrame
data_idf.select('idf').show()

                                                                                

+--------------------+
|                 idf|
+--------------------+
|(13423,[7,11,31,6...|
|(13423,[0,24,301,...|
|(13423,[2,13,19,3...|
|(13423,[0,70,80,1...|
|(13423,[36,134,31...|
|(13423,[10,60,140...|
|(13423,[10,53,102...|
|(13423,[127,185,4...|
|(13423,[1,47,121,...|
|(13423,[0,1,13,27...|
|(13423,[18,43,117...|
|(13423,[8,16,37,8...|
|(13423,[13,30,47,...|
|(13423,[39,95,221...|
|(13423,[555,1797,...|
|(13423,[30,109,11...|
|(13423,[82,214,44...|
|(13423,[0,2,49,13...|
|(13423,[0,74,105,...|
|(13423,[4,30,33,5...|
+--------------------+
only showing top 20 rows



In [11]:
# Create a StringIndexer object, specifying the input and output columns
hamOrSpam = StringIndexer(inputCol='class', outputCol='label')

# Fit the StringIndexer model on the 'data_idf' DataFrame
data_index_fit = hamOrSpam.fit(data_idf)

# Transform the 'data_idf' DataFrame using the fitted StringIndexer model
data_index_transform = data_index_fit.transform(data_idf)

# Select and display the 'label' and 'idf' columns from the resulting DataFrame
data_index_transform.select('label','idf').show()

+-----+--------------------+
|label|                 idf|
+-----+--------------------+
|  0.0|(13423,[7,11,31,6...|
|  0.0|(13423,[0,24,301,...|
|  1.0|(13423,[2,13,19,3...|
|  0.0|(13423,[0,70,80,1...|
|  0.0|(13423,[36,134,31...|
|  1.0|(13423,[10,60,140...|
|  0.0|(13423,[10,53,102...|
|  0.0|(13423,[127,185,4...|
|  1.0|(13423,[1,47,121,...|
|  1.0|(13423,[0,1,13,27...|
|  0.0|(13423,[18,43,117...|
|  1.0|(13423,[8,16,37,8...|
|  1.0|(13423,[13,30,47,...|
|  0.0|(13423,[39,95,221...|
|  0.0|(13423,[555,1797,...|
|  1.0|(13423,[30,109,11...|
|  0.0|(13423,[82,214,44...|
|  0.0|(13423,[0,2,49,13...|
|  0.0|(13423,[0,74,105,...|
|  1.0|(13423,[4,30,33,5...|
+-----+--------------------+
only showing top 20 rows



In [12]:
# Import the VectorAssembler from PySpark ML
from pyspark.ml.feature import VectorAssembler

# Create a VectorAssembler object, specifying the input columns and the output column
vector = VectorAssembler(inputCols=['idf','lenght'] , outputCol='features')

# Transform the 'data_index_transform' DataFrame using the VectorAssembler
data_vector = vector.transform(data_index_transform)

# Select and display the 'features' column from the resulting DataFrame
data_vector.select('features').show()

+--------------------+
|            features|
+--------------------+
|(13424,[7,11,31,6...|
|(13424,[0,24,301,...|
|(13424,[2,13,19,3...|
|(13424,[0,70,80,1...|
|(13424,[36,134,31...|
|(13424,[10,60,140...|
|(13424,[10,53,102...|
|(13424,[127,185,4...|
|(13424,[1,47,121,...|
|(13424,[0,1,13,27...|
|(13424,[18,43,117...|
|(13424,[8,16,37,8...|
|(13424,[13,30,47,...|
|(13424,[39,95,221...|
|(13424,[555,1797,...|
|(13424,[30,109,11...|
|(13424,[82,214,44...|
|(13424,[0,2,49,13...|
|(13424,[0,74,105,...|
|(13424,[4,30,33,5...|
+--------------------+
only showing top 20 rows



In [13]:
# Split the 'data_vector' DataFrame into training (70%) and testing (30%) datasets
train , test = data_vector.randomSplit([0.7,0.3])

# Import the NaiveBayes classifier from PySpark ML
from pyspark.ml.classification import NaiveBayes

# Create a NaiveBayes model
model = NaiveBayes()

# Fit the NaiveBayes model on the training data
model_fit = model.fit(train)

# Make predictions on the test data using the fitted model
test_result = model_fit.transform(test)

# Show the results of the predictions
test_result.show()

23/09/18 10:32:39 WARN DAGScheduler: Broadcasting large task binary with size 1149.8 KiB
23/09/18 10:32:40 WARN DAGScheduler: Broadcasting large task binary with size 1109.1 KiB
23/09/18 10:32:40 WARN DAGScheduler: Broadcasting large task binary with size 1369.2 KiB


+-----+--------------------+------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+----------+
|class|                text|lenght|               words|           StopWords|               c_vec|                 idf|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+----------+
|  ham| &lt;#&gt;  mins ...|    51|[, &lt;#&gt;, , m...|[, &lt;#&gt;, , m...|(13423,[3,6,41,20...|(13423,[3,6,41,20...|  0.0|(13424,[3,6,41,20...|[-296.21333971150...|[1.0,3.9256758039...|       0.0|
|  ham| &lt;DECIMAL&gt; ...|   132|[, &lt;decimal&gt...|[, &lt;decimal&gt...|(13423,[3,84,115,...|(13423,[3,84,115,...|  0.0|(13424,[3,84,115,...|[-875.39176509594...|[1.0,8.9058032888...|       0.0|


23/09/18 10:32:40 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [14]:
# Import the MulticlassClassificationEvaluator from PySpark ML
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create a MulticlassClassificationEvaluator object
evaluator_model = MulticlassClassificationEvaluator()

# Evaluate the model's performance on the test data and calculate the accuracy
acc = evaluator_model.evaluate(test_result)

# Display the accuracy
acc

23/09/18 10:32:40 WARN DAGScheduler: Broadcasting large task binary with size 1359.7 KiB


0.9263965631920426