In [1]:
import os
from pyspark import SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [2]:
# Reuse the running SparkContext when there is one. A bare SparkContext() raises
# "Cannot run multiple SparkContexts at once" if this cell is executed a second
# time in the same kernel, which makes the notebook non-re-runnable.
sc = SparkContext.getOrCreate()

# SQL/DataFrame entry point bound to the context above.
spark = SparkSession(sc)

In [3]:
# Input files live in ./data relative to the notebook's working directory.
path = os.getcwd()
data_dir = os.path.join(path, "data")


def load_labeled_rows(filename, label):
    """Read one text file from ``data_dir`` and label every line.

    Parameters
    ----------
    filename : str
        File name inside ``data_dir``.
    label : int
        Class label stored on every row (this notebook uses 0 for no-spam
        and 1 for spam).

    Returns
    -------
    RDD
        One ``Row(label=..., text=[...])`` per input line, where ``text`` is
        the line split on runs of whitespace.
    """
    lines = sc.textFile(os.path.join(data_dir, filename))
    return lines.map(lambda line: Row(label=label, text=line.split()))


# One labelled RDD per (class, split) combination.
rdd_nospam_train = load_labeled_rows("nospam_training.txt", 0)
rdd_spam_train = load_labeled_rows("spam_training.txt", 1)
rdd_nospam_test = load_labeled_rows("nospam_testing.txt", 0)
rdd_spam_test = load_labeled_rows("spam_testing.txt", 1)

df_nospam = spark.createDataFrame(rdd_nospam_train)
df_spam = spark.createDataFrame(rdd_spam_train)
df_test_nospam = spark.createDataFrame(rdd_nospam_test)
df_test_spam = spark.createDataFrame(rdd_spam_test)

# Union the two classes into a single training set and a single test set.
df = df_nospam.union(df_spam)
df_test = df_test_nospam.union(df_test_spam)


In [4]:
# Fit the CountVectorizer on the training data only. It reads the token lists
# in "text" and writes term-count vectors to "features", with the vocabulary
# size set to 1000.
cv = CountVectorizer(
    inputCol="text",
    outputCol="features",
    vocabSize=1000,
)
model = cv.fit(df)



In [5]:
# Apply the training-fitted CountVectorizerModel to the training DataFrame so
# that every row gains a "features" column with its term counts.
# Note: no collect() here. Its return value would be discarded, so it would only
# drag the entire training set onto the driver for nothing.
result = model.transform(df)

# The test DataFrame is vectorized with the SAME model (vocabulary learned from
# the training set), never with one fitted on test data.
result_test = model.transform(df_test)

In [6]:
# Fit the inverse-document-frequency model on the training term counts only.
# IDF here is pyspark.ml.feature.IDF: the later import in the first cell
# shadows the pyspark.mllib.feature.IDF imported just before it.
idf = IDF(inputCol="features", outputCol="idf_features")
idfModel = idf.fit(result)

# Re-weight the training set with the fitted IDF model ...
r = idfModel.transform(result)

# ... and the test set with that same training-fitted model.
r_test = idfModel.transform(result_test)


In [7]:
# Logistic regression over the IDF-weighted vectors. labelCol is spelled out
# for readability; "label" is also the estimator's default.
lr = LogisticRegression(
    featuresCol="idf_features",
    labelCol="label",
)

# Train on the transformed training set.
lrModel = lr.fit(r)

In [8]:
# Score the transformed test set; the resulting DataFrame carries the model's
# "prediction" column alongside the original "label".
r_test1 = lrModel.transform(r_test)

In [9]:
def _prediction_and_label(row):
    """Return ``(prediction, label)`` for one scored row, label cast to float."""
    return (row["prediction"], float(row["label"]))


# RDD of (prediction, label) pairs built from the scored test set.
rdd_r_test1 = r_test1.rdd.map(_prediction_and_label)

In [10]:
# Evaluate the test-set predictions and persist the report to
# spamfilter_evaluation.txt.

# Build the metrics object once instead of once per figure; every
# MulticlassMetrics(...) construction re-processes the same RDD.
metrics = MulticlassMetrics(rdd_r_test1)
confusion = metrics.confusionMatrix().toArray()

# Assemble the whole report before touching the file, so a failing metric
# computation cannot leave a half-written report behind.
report_lines = ["Confusion Matrix:"]

# One line per matrix ROW (true label). Writing the columns instead would
# store the transpose and contradict the legend below.
report_lines.extend(str(row) for row in confusion)

# Classes are ordered by ascending label, so index 0 is No-Spam (label 0) and
# index 1 is Spam (label 1).
report_lines.append("The rows are true labels and the columns predictions. (First Row: No-Spam, Second Row: Spam. Columns ordered analogous)")
report_lines.append("")

report_lines.append("Recall Spam:" + str(metrics.recall(label=1)))
report_lines.append("Recall No-Spam:" + str(metrics.recall(label=0)))
report_lines.append("Precision Spam:" + str(metrics.precision(label=1)))
report_lines.append("Precision No-Spam:" + str(metrics.precision(label=0)))

# `with` guarantees the file handle is closed even if the write fails.
with open("spamfilter_evaluation.txt", "w") as text_file:
    text_file.write("\n".join(report_lines))

In [11]:
# Rows are true labels and columns are predictions, both ordered by ascending
# class label: index 0 is No-Spam (label 0), index 1 is Spam (label 1).
print("Confusion Matrix:")
print(MulticlassMetrics(rdd_r_test1).confusionMatrix().toArray())
print("The rows are true labels and the columns predictions. (First Row: No-Spam, Second Row: Spam. Columns ordered analogous)")


Confusion Matrix:
[[924.  41.]
 [ 24. 125.]]
The rows are true labels and the columns predictions. (First Row: Spam, Second Row: No-Spam. Columns ordered analogous)


In [12]:
# Build the metrics object once and reuse it for all four figures instead of
# constructing a new MulticlassMetrics per print.
metrics = MulticlassMetrics(rdd_r_test1)

# Per-class recall (label 1 = spam, label 0 = no-spam).
print("Recall Spam:" + str(metrics.recall(label=1)))
print("Recall No-Spam:" + str(metrics.recall(label=0)))

# Per-class precision.
print("Precision Spam:" + str(metrics.precision(label=1)))
print("Precision No-Spam:" + str(metrics.precision(label=0)))


Recall Spam:0.8389261744966443
Recall No-Spam:0.9575129533678757
Precision Spam:0.7530120481927711
Precision No-Spam:0.9746835443037974
