### Prof. Fernando Amaral https://www.eia.ai/

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import Tokenizer, StringIndexer, Word2Vec
# Start (or reuse) the Spark session for this notebook; the application name is "nlp".
spark = (
    SparkSession.builder
    .appName("nlp")
    .getOrCreate()
)

In [None]:
# Read every row and column of the `spam` table through Spark SQL.
# NOTE(review): the table/view must already be registered in this session;
# where that happens is not shown in this notebook - confirm.
spam = spark.sql(
    "select * from spam"
)

In [None]:
# Preview the first five rows of the raw data.
spam.show(n=5)

+--------+--------------------+
|Category|             Message|
+--------+--------------------+
|     ham|Go until jurong p...|
|     ham|Ok lar... Joking ...|
|    spam|Free entry in 2 a...|
|     ham|U dun say so earl...|
|     ham|Nah I don't think...|
+--------+--------------------+
only showing top 5 rows



In [None]:
# Encode the string label column "Category" as a numeric "CategoryIndex"
# column, which is the label column the classifier is trained on later.
stringmodel = StringIndexer(
    inputCol="Category",
    outputCol="CategoryIndex",
)
# Learn the label-to-index mapping from the data, then apply it to the same
# DataFrame.
category_index_model = stringmodel.fit(spam)
spamnew = category_index_model.transform(spam)
spamnew.show(n=5)

+--------+--------------------+-------------+
|Category|             Message|CategoryIndex|
+--------+--------------------+-------------+
|     ham|Go until jurong p...|          0.0|
|     ham|Ok lar... Joking ...|          0.0|
|    spam|Free entry in 2 a...|          1.0|
|     ham|U dun say so earl...|          0.0|
|     ham|Nah I don't think...|          0.0|
+--------+--------------------+-------------+
only showing top 5 rows



In [None]:
# Split each "Message" string into a list of lowercase tokens stored in the
# new "MessageToken" column.
tokens = Tokenizer(inputCol="Message", outputCol="MessageToken")
spamtoken = tokens.transform(spamnew)
# Preview the tokenized rows, as the other transformation cells do; without
# this call the cell produces no table.
spamtoken.show(5)

+--------+--------------------+-------------+--------------------+
|Category|             Message|CategoryIndex|        MessageToken|
+--------+--------------------+-------------+--------------------+
|     ham|Go until jurong p...|          0.0|[go, until, juron...|
|     ham|Ok lar... Joking ...|          0.0|[ok, lar..., joki...|
|    spam|Free entry in 2 a...|          1.0|[free, entry, in,...|
|     ham|U dun say so earl...|          0.0|[u, dun, say, so,...|
|     ham|Nah I don't think...|          0.0|[nah, i, don't, t...|
+--------+--------------------+-------------+--------------------+
only showing top 5 rows



In [None]:
# Show only the tokenized messages for the first five rows.
tokenized_only = spamtoken.select("MessageToken")
tokenized_only.show(n=5)

+--------------------+
|        MessageToken|
+--------------------+
|[go, until, juron...|
|[ok, lar..., joki...|
|[free, entry, in,...|
|[u, dun, say, so,...|
|[nah, i, don't, t...|
+--------------------+
only showing top 5 rows



In [None]:
# Learn word embeddings from the token lists and add one feature vector per
# message in the new "Messagew2v" column.
word2vec = Word2Vec(
    inputCol="MessageToken",
    outputCol="Messagew2v",
)
# Fit the embedding model on the tokenized data, then apply it to that same
# DataFrame.
word2vec_model = word2vec.fit(spamtoken)
spamresult = word2vec_model.transform(spamtoken)
spamresult.show(n=5)

+--------+--------------------+-------------+--------------------+--------------------+
|Category|             Message|CategoryIndex|        MessageToken|          Messagew2v|
+--------+--------------------+-------------+--------------------+--------------------+
|     ham|Go until jurong p...|          0.0|[go, until, juron...|[8.76827139290981...|
|     ham|Ok lar... Joking ...|          0.0|[ok, lar..., joki...|[0.03173843957483...|
|    spam|Free entry in 2 a...|          1.0|[free, entry, in,...|[-0.0297333014397...|
|     ham|U dun say so earl...|          0.0|[u, dun, say, so,...|[0.04455647329715...|
|     ham|Nah I don't think...|          0.0|[nah, i, don't, t...|[0.05425926097310...|
+--------+--------------------+-------------+--------------------+--------------------+
only showing top 5 rows



In [None]:
# Show only the Word2Vec feature vectors for the first five rows.
vectors_only = spamresult.select("Messagew2v")
vectors_only.show(n=5)

+--------------------+
|          Messagew2v|
+--------------------+
|[8.76827139290981...|
|[0.03173843957483...|
|[-0.0297333014397...|
|[0.04455647329715...|
|[0.05425926097310...|
+--------------------+
only showing top 5 rows



In [None]:
# Hold out roughly 30% of the rows for testing and train on the remaining ~70%.
# A fixed seed makes the split (and therefore the metric computed from it)
# repeatable between runs, provided the input data and its partitioning are
# unchanged.
spamTreino, spamTeste = spamresult.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Train a 500-tree random forest that predicts "CategoryIndex" from the
# "Messagew2v" feature vectors, using only the training split.
rf = RandomForestClassifier(
    featuresCol="Messagew2v",
    labelCol="CategoryIndex",
    numTrees=500,
)
modelo = rf.fit(spamTreino)

In [None]:
# Score the held-out test split with the trained forest; the result carries
# the added rawPrediction, probability and prediction columns.
previsoes = modelo.transform(dataset=spamTeste)

In [None]:
# Inspect the first ten scored test rows.
previsoes.show(n=10)

+--------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+----------+
|Category|             Message|CategoryIndex|        MessageToken|          Messagew2v|       rawPrediction|         probability|prediction|
+--------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+----------+
|     ham|&lt;#&gt;  am I t...|          0.0|[&lt;#&gt;, , am,...|[0.03531881545980...|[487.385576611302...|[0.97477115322260...|       0.0|
|     ham|&lt;#&gt;  great ...|          0.0|[&lt;#&gt;, , gre...|[0.02178961889252...|[486.883007131753...|[0.97376601426350...|       0.0|
|     ham|&lt;#&gt;  in mca...|          0.0|[&lt;#&gt;, , in,...|[0.03211816160806...|[487.350223608614...|[0.97470044721722...|       0.0|
|     ham|(And my man carlo...|          0.0|[(and, my, man, c...|[0.00276377212139...|[487.287068146138...|[0.97457413629227...|       0.0|
|     ham|(I 

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Compute the area under the ROC curve on the test predictions.
# The evaluator reads "rawPrediction" (the per-class score vector) rather than
# the hard 0/1 "prediction" column: with hard labels the ROC curve collapses to
# a single operating point, so the resulting AUC does not reflect how well the
# model ranks messages across thresholds.
avaliar = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="CategoryIndex",
    metricName="areaUnderROC",
)
areaUnderRoc = avaliar.evaluate(previsoes)
print(areaUnderRoc)

0.8606955197507615
