### Prof. Fernando Amaral - www.eia.ai
#### Machine Learning com Spark
##### LogisticRegression 

In [1]:
# findspark.init() has to run BEFORE the first `import pyspark`: its job is to
# locate the Spark installation and add PySpark to sys.path. Importing pyspark
# first would fail on exactly the machines where findspark is needed.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a new one.
spark = SparkSession.builder.appName("logisticClass").getOrCreate()

In [2]:
# Load the churn dataset: semicolon-separated, first row holds the column
# names, and column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True, sep=";")
churn = csv_reader.csv("Churn.csv")

# Preview the first rows.
churn.show(5)

+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|
|        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|
|        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|
|        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+

In [3]:
from pyspark.ml.feature import RFormula

# R-style formula: "Exited" is the response and "." stands for all remaining
# columns. The assembled feature vector is written to "independente" and the
# label to "dependente".
Rformula = RFormula(
    formula="Exited ~ .",
    featuresCol="independente",
    labelCol="dependente",
)

# Fit the formula on the data, then apply it to the same DataFrame.
rformula_model = Rformula.fit(churn)
churnrf = rformula_model.transform(churn)

# Show the generated feature vectors and labels without truncating them.
churnrf.select(["independente", "dependente"]).show(n=5, truncate=False)

+--------------------------------------------------------------+----------+
|independente                                                  |dependente|
+--------------------------------------------------------------+----------+
|[619.0,1.0,0.0,0.0,42.0,2.0,0.0,1.0,1.0,1.0,1.0134888E7]      |1.0       |
|[608.0,0.0,0.0,0.0,41.0,1.0,8380786.0,1.0,0.0,1.0,1.1254258E7]|0.0       |
|[502.0,1.0,0.0,0.0,42.0,8.0,1596608.0,3.0,1.0,0.0,1.1393157E7]|1.0       |
|(11,[0,1,4,5,7,10],[699.0,1.0,39.0,1.0,2.0,9382663.0])        |0.0       |
|[850.0,0.0,0.0,0.0,43.0,2.0,1.2551082E7,1.0,1.0,1.0,790841.0] |0.0       |
+--------------------------------------------------------------+----------+
only showing top 5 rows



In [4]:
# Fixed seed so the 80/20 split -- and every metric computed from it further
# down -- is reproducible across kernel restarts. Without `seed`, randomSplit
# picks a new random seed on every run.
SEED = 42
churnTreino, churnTeste = churnrf.randomSplit([0.8, 0.2], seed=SEED)

# Row counts of the training and test partitions (the weights are only
# approximate targets, so the counts are not an exact 80/20).
print(churnTreino.count())
print(churnTeste.count())

8027
1973


In [5]:
from pyspark.ml.classification import LogisticRegression

# Estimator configuration: which columns hold the features and the label,
# the iteration cap, and the regularization strength.
logistic_params = dict(
    featuresCol="independente",
    labelCol="dependente",
    maxIter=100,
    regParam=0.08,
)
logistic = LogisticRegression(**logistic_params)

# Fit on the training partition only.
modelo = logistic.fit(churnTreino)

In [6]:
# `modelo.summary` is the summary produced while fitting, i.e. these metrics
# describe the training data, not the held-out test partition.
resumo = modelo.summary

# Note: precision and recall here are the label-weighted variants.
acuracia, precisao, recall, auc = (
    resumo.accuracy,
    resumo.weightedPrecision,
    resumo.weightedRecall,
    resumo.areaUnderROC,
)

# Labels and values are passed to print() as separate arguments so they are
# joined with the default single-space separator.
partes = ("Acurácia: ", acuracia, "\nPrecisão: ", precisao, "\nRecall: ", recall, "\nAUC ", auc)
print(*partes)

Acurácia:  0.8050330135791703 
Precisão:  0.7833896682456508 
Recall:  0.8050330135791703 
AUC  0.761732093905445


In [7]:
# Score the held-out test partition with the fitted model.
previsao = modelo.transform(churnTeste)

# Put the true label next to the model outputs for a quick visual check.
colunas_saida = ["dependente", "prediction", "probability", "rawPrediction"]
previsao.select(*colunas_saida).show(n=5, truncate=False)

+----------+----------+----------------------------------------+------------------------------------------+
|dependente|prediction|probability                             |rawPrediction                             |
+----------+----------+----------------------------------------+------------------------------------------+
|1.0       |0.0       |[0.7017715116470331,0.2982284883529669] |[0.8557479369103973,-0.8557479369103973]  |
|1.0       |0.0       |[0.59555718690573,0.40444281309427]     |[0.38698706620395074,-0.38698706620395074]|
|1.0       |0.0       |[0.8006200468298809,0.19937995317011914]|[1.3901741692855105,-1.3901741692855105]  |
|1.0       |0.0       |[0.5782981227127766,0.4217018772872234] |[0.31579090648676766,-0.31579090648676766]|
|1.0       |0.0       |[0.8268931798551983,0.17310682014480172]|[1.5637666594132233,-1.5637666594132233]  |
+----------+----------+----------------------------------------+------------------------------------------+
only showing top 5 rows



In [8]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve on the test-set predictions, computed from the raw
# prediction column against the true label column.
evaluator_params = {
    "rawPredictionCol": "rawPrediction",
    "labelCol": "dependente",
    "metricName": "areaUnderROC",
}
avaliar = BinaryClassificationEvaluator(**evaluator_params)

areaUnderRoc = avaliar.evaluate(previsao)
print(areaUnderRoc)


0.784388011896589
