In [1]:
import pyspark
from pyspark.sql import SparkSession
# from pyspark.ml.feature import 
import pyspark.sql.functions as sqlf


In [2]:
# Reuse the active Spark session if there is one; otherwise start one named "app".
spark = (
    SparkSession.builder
    .appName("app")
    .getOrCreate()
)

In [3]:
# Tab-separated SMS corpus. The file has no header row, so Spark assigns the
# positional column names _c0 and _c1.
# NOTE(review): the reader is left on its default quote handling — confirm that
# messages containing double quotes come through unmodified.
sms_path = "./Python-and-Spark-for-Big-Data-master/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Natural_Language_Processing/smsspamcollection/SMSSpamCollection"
df = spark.read.csv(sms_path, sep="\t")

In [4]:
# Preview the first 20 rows; long cell values are truncated in the printout.
df.show(n=20, truncate=True)

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
|spam|FreeMsg Hey there...|
| ham|Even my brother i...|
| ham|As per your reque...|
|spam|WINNER!! As a val...|
|spam|Had your mobile 1...|
| ham|I'm gonna be home...|
|spam|SIX chances to wi...|
|spam|URGENT! You have ...|
| ham|I've been searchi...|
| ham|I HAVE A DATE ON ...|
|spam|XXXMobileMovieClu...|
| ham|Oh k...i'm watchi...|
| ham|Eh u remember how...|
| ham|Fine if thats th...|
|spam|England v Macedon...|
+----+--------------------+
only showing top 20 rows



In [5]:
# Total number of rows (messages) read from the file.
df.count()

5574

In [8]:
# Replace the positional CSV column names with meaningful ones.
# withColumnRenamed is a no-op when the old name is absent, so re-running
# this cell is harmless.
for old_name, new_name in (("_c0", "class"), ("_c1", "text")):
    df = df.withColumnRenamed(old_name, new_name)

In [9]:
# Confirm the rename took effect.
df.columns

['class', 'text']

In [10]:
# Add the character length of each message as a numeric feature.
df = df.withColumn("length", sqlf.length(sqlf.col("text")))

In [11]:
# Spot-check the new `length` column on the first three rows.
df.show(n=3)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
+-----+--------------------+------+
only showing top 3 rows



In [13]:
# Average every numeric column per class; `length` is the only numeric column here.
mean_by_class = df.groupBy("class").avg()
mean_by_class.show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



In [14]:
from pyspark.ml.feature import Tokenizer, CountVectorizer, StopWordsRemover, IDF, StringIndexer

In [15]:
# Stage: split each message in `text` into a list of tokens.
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="token_text",
)

In [16]:
# Stage: drop stop words from the token list.
stop_remover = StopWordsRemover(
    inputCol="token_text",
    outputCol="stop_token",
)

In [17]:
# Stage: turn the filtered tokens into a term-count vector.
count_vectorizer = CountVectorizer(
    inputCol="stop_token",
    outputCol="c_vec",
)

In [18]:
# Stage: rescale the term counts by inverse document frequency.
idf = IDF(
    inputCol="c_vec",
    outputCol="tf_idf",
)

In [19]:
# Stage: encode the string `class` column as a numeric `label` column.
string_to_int = StringIndexer(
    inputCol="class",
    outputCol="label",
)

In [20]:
from pyspark.ml.feature import VectorAssembler

In [21]:
# Stage: concatenate the TF-IDF vector and the message length into one
# `features` vector.
v_ass = VectorAssembler(
    inputCols=["tf_idf", "length"],
    outputCol="features",
)

In [22]:
from pyspark.ml.classification import NaiveBayes

In [34]:
# Naive Bayes classifier; the column names are written out and match the
# estimator's defaults.
nb = NaiveBayes(featuresCol="features", labelCol="label")

In [35]:
from pyspark.ml import Pipeline

In [36]:
# Feature-engineering pipeline: label encoding, then text -> tokens ->
# stop-word filtering -> term counts -> TF-IDF, then assembly of `features`.
prep_stages = [
    string_to_int,
    tokenizer,
    stop_remover,
    count_vectorizer,
    idf,
    v_ass,
]
data_pipeline = Pipeline(stages=prep_stages)

In [37]:
# Fit every stage on the complete DataFrame. This happens before the
# train/test split, so the fitted stages are learned from all rows,
# including those that later land in the test set.
cleaned = data_pipeline.fit(dataset=df)

In [38]:
# Apply the fitted pipeline to add the `label` and `features` columns.
cleaned_data = cleaned.transform(dataset=df)

In [41]:
# Keep only the two columns the classifier needs.
cleaned_data = cleaned_data.select("label", "features")

In [42]:
# 80/20 train/test split. A fixed seed keeps the split the same across
# re-runs so the reported metric can be reproduced; without it every run
# trains and evaluates on a different random sample.
train, test = cleaned_data.randomSplit([0.8, 0.2], seed=42)

In [44]:
# Train the Naive Bayes classifier on the training split.
nb_model = nb.fit(dataset=train)

In [45]:
# Score the held-out split; this adds the rawPrediction, probability and
# prediction columns.
predictions = nb_model.transform(dataset=test)

In [46]:
# Preview the first 20 scored rows; long cell values are truncated.
predictions.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,3,9,1...|[-575.45234769703...|[0.99999999999998...|       0.0|
|  0.0|(13424,[0,1,7,8,1...|[-885.90035926573...|[1.0,4.8545852319...|       0.0|
|  0.0|(13424,[0,1,24,31...|[-356.50587080003...|[1.0,8.9972221107...|       0.0|
|  0.0|(13424,[0,1,30,11...|[-596.44929890683...|[1.0,1.9929717097...|       0.0|
|  0.0|(13424,[0,1,31,43...|[-342.38938988104...|[1.0,4.1904217075...|       0.0|
|  0.0|(13424,[0,1,43,69...|[-619.13193968306...|[0.99994496480073...|       0.0|
|  0.0|(13424,[0,1,150,1...|[-252.88099401747...|[0.88900677148327...|       0.0|
|  0.0|(13424,[0,2,3,8,2...|[-1613.2448311084...|[1.0,3.2608550113...|       0.0|
|  0.0|(13424,[0,2,4,10,...|[-1237.8001121058...|[1.0,6.1924158009...|       0.0|
|  0.0|(13424,[0

In [48]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [49]:
# The metric is spelled out rather than left implicit; "f1" is the
# evaluator's default, so the score is an F1 score, not accuracy.
evalu = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)

In [53]:
# Compute the metric configured on `evalu` over the test-set predictions.
evalu.evaluate(dataset=predictions)

0.9296014645489316