# Formação Spark com Pyspark: o Curso Completo

## Seção 5: Machine Learning com Spark

Curso da Udemy.

In [26]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as Func
from pyspark.sql.types import *

from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.classification import DecisionTreeClassifier, NaiveBayes
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, RFormula
from pyspark.ml import Pipeline

import os
import sys
import shutil

os.environ['PYSPARK_PYTHON'] = sys.executable

## Machine Learning com Spark

* spark.mllib
* spark.ml

* ML baseado em RDD está descontinuado
* Implementações todas em DataFrames

Tradicionalmente:
* Variáveis independentes são colunas distintas
* Variável dependente: outra coluna

No spark:
* Normalmente todas as variáveis independentes devem compor uma mesma coluna
* Cria-se um vetor único, que é adicionado em nova coluna no DataFrame
* Machine Learning suporta apenas número
    * Atributos categóricos devem ser transformados (encoding)
    * OneHot Encoding utiliza uma matriz esparsa
* Fórmulas do R
    * R permite definir modelo através de fórmula
    * [variável dependente] ~ [variáveis independentes]
    * Ponto define todos os atributos - variável dependente
    * Spark implementa Rformula
        * Aplica One Hot Encoding e combina variáveis independentes em uma única coluna
* Pipelines
    * Transformer: transforma um DataFrame em outro DataFrame
    * Estimator: fit em DataFrame para produzir um Transformer
    * Pipeline: conecta Transformers e Estimators para produzir modelo
    * Parâmetros: Transformers e Estimators compartilham um API para definir parâmetros

### Preparando Dados para Regressão

In [2]:
# criando sessão do spark
spark = SparkSession.builder \
    .appName("Curso Pyspark") \
    .getOrCreate()

In [3]:
# importando dados - prever potencia de carro (HP)
carros_temp = spark.read.csv('../data/Carros.csv', inferSchema = True, header = True, sep = ';')
carros_temp.show(5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [4]:
carros = carros_temp.select('Consumo', 'Cilindros', 'Cilindradas', 'HP')
carros.show(5)

+-------+---------+-----------+---+
|Consumo|Cilindros|Cilindradas| HP|
+-------+---------+-----------+---+
|     21|        6|        160|110|
|     21|        6|        160|110|
|    228|        4|        108| 93|
|    214|        6|        258|110|
|    187|        8|        360|175|
+-------+---------+-----------+---+
only showing top 5 rows



In [5]:
# criando vetor para treinar modelo
veccaracteristicas = VectorAssembler(inputCols = ['Consumo', 'Cilindros', 'Cilindradas'], outputCol = 'caracteristicas')

In [6]:
# transformando DataFrame carros em um vetor
carros_carac = veccaracteristicas.transform(carros)
carros_carac.show(5)

+-------+---------+-----------+---+-----------------+
|Consumo|Cilindros|Cilindradas| HP|  caracteristicas|
+-------+---------+-----------+---+-----------------+
|     21|        6|        160|110| [21.0,6.0,160.0]|
|     21|        6|        160|110| [21.0,6.0,160.0]|
|    228|        4|        108| 93|[228.0,4.0,108.0]|
|    214|        6|        258|110|[214.0,6.0,258.0]|
|    187|        8|        360|175|[187.0,8.0,360.0]|
+-------+---------+-----------+---+-----------------+
only showing top 5 rows



In [7]:
# separacao treino teste
carros_treino, carros_teste = carros_carac.randomSplit([0.7, 0.3], seed = 24)
print(carros_treino.count())
print(carros_teste.count())

23
9


### Regressão Linear

In [8]:
# criando e treinando modelo de regressão linear
reglin = LinearRegression(featuresCol = 'caracteristicas', labelCol = 'HP')
modelo = reglin.fit(carros_treino)

In [9]:
# predict dos dados de teste
previsao = modelo.transform(carros_teste)
previsao.show(5)

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|   caracteristicas|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|    143|        8|        360|245| [143.0,8.0,360.0]|211.74015872646999|
|    152|        8|        304|150| [152.0,8.0,304.0]|211.17070861674188|
|    164|        8|       2758|180|[164.0,8.0,2758.0]|203.57816690948073|
|    173|        8|       2758|180|[173.0,8.0,2758.0]|202.85738669399979|
|    192|        6|       1676|123|[192.0,6.0,1676.0]|138.76957605968335|
+-------+---------+-----------+---+------------------+------------------+
only showing top 5 rows



### Avaliação da Regressão Linear

In [10]:
# avaliando performance do modelo
avaliar = RegressionEvaluator(predictionCol = 'prediction', labelCol ='HP', metricName = 'rmse')
rmse = avaliar.evaluate(previsao)
print(rmse)

32.33357364996903


### Random Forest Regressor

In [11]:
# treinando random forest
rfreg = RandomForestRegressor(featuresCol = 'caracteristicas', labelCol = 'HP')
modelo2 = rfreg.fit(carros_treino)

In [12]:
# predict dos dados de teste
previsao2 = modelo2.transform(carros_teste)
previsao2.show(5)

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|   caracteristicas|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|    143|        8|        360|245| [143.0,8.0,360.0]|215.63652777777776|
|    152|        8|        304|150| [152.0,8.0,304.0]|236.51748015873017|
|    164|        8|       2758|180|[164.0,8.0,2758.0]|190.35666666666668|
|    173|        8|       2758|180|[173.0,8.0,2758.0]| 187.5066666666667|
|    192|        6|       1676|123|[192.0,6.0,1676.0]|116.34166666666667|
+-------+---------+-----------+---+------------------+------------------+
only showing top 5 rows



### Avaliação da Random Forest Regressor

In [13]:
# avaliando performance do modelo
rmse = avaliar.evaluate(previsao2)
print(rmse)

39.92372972016067


### Preparando dados para Classificação

In [14]:
# importando dados - prever churn de clientes (Exited)
churn = spark.read.csv('../data/Churn.csv', inferSchema = True, header = True, sep = ';')
churn.show(5)

+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|
|        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|
|        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|
|        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+-------

In [15]:
# criando RFormula para modelo
formula = RFormula(formula = 'Exited ~ .', featuresCol = 'features', labelCol = 'label', handleInvalid = 'skip')

In [16]:
# criando transformacao para treinar modelo
churn_trans = formula.fit(churn).transform(churn).select('features', 'label')
churn_trans.show(5, truncate = False)

+--------------------------------------------------------------+-----+
|features                                                      |label|
+--------------------------------------------------------------+-----+
|[619.0,1.0,0.0,0.0,42.0,2.0,0.0,1.0,1.0,1.0,1.0134888E7]      |1.0  |
|[608.0,0.0,0.0,0.0,41.0,1.0,8380786.0,1.0,0.0,1.0,1.1254258E7]|0.0  |
|[502.0,1.0,0.0,0.0,42.0,8.0,1596608.0,3.0,1.0,0.0,1.1393157E7]|1.0  |
|(11,[0,1,4,5,7,10],[699.0,1.0,39.0,1.0,2.0,9382663.0])        |0.0  |
|[850.0,0.0,0.0,0.0,43.0,2.0,1.2551082E7,1.0,1.0,1.0,790841.0] |0.0  |
+--------------------------------------------------------------+-----+
only showing top 5 rows



In [17]:
# separacao treino teste
churn_treino, churn_teste = churn_trans.randomSplit([0.7, 0.3], seed = 24)
print(churn_treino.count())
print(churn_teste.count())

6982
3018


### Árvore de Decisão

In [18]:
# criando e treinando modelo de árvore de decisão
dt = DecisionTreeClassifier(labelCol = 'label', featuresCol = 'features')
modelo = dt.fit(churn_treino)

In [19]:
# predict dos dados de teste
previsao = modelo.transform(churn_teste)
previsao.show(5)

+--------------------+-----+--------------+--------------------+----------+
|            features|label| rawPrediction|         probability|prediction|
+--------------------+-----+--------------+--------------------+----------+
|(11,[0,1,4,5,7,10...|  0.0|[4637.0,606.0]|[0.88441731832920...|       0.0|
|(11,[0,1,4,5,7,10...|  0.0|[4637.0,606.0]|[0.88441731832920...|       0.0|
|(11,[0,1,4,5,7,10...|  0.0|[4637.0,606.0]|[0.88441731832920...|       0.0|
|(11,[0,1,4,5,7,10...|  0.0|[4637.0,606.0]|[0.88441731832920...|       0.0|
|(11,[0,1,4,5,7,10...|  0.0|[4637.0,606.0]|[0.88441731832920...|       0.0|
+--------------------+-----+--------------+--------------------+----------+
only showing top 5 rows



### Avaliação da Árvore de Decisão

In [20]:
# avaliando performance do modelo
avaliar = BinaryClassificationEvaluator(rawPredictionCol = 'prediction', labelCol ='label', metricName = 'areaUnderROC')
auc_roc = avaliar.evaluate(previsao)
print(auc_roc)

0.6912937265638351


### Pipelines

In [22]:
# criando pipeline
pipeline = Pipeline(stages = [veccaracteristicas, reglin])
pipelineModel = pipeline.fit(carros)

In [25]:
# previsao a partir da pipeline
previsao = pipelineModel.transform(carros)
previsao.show(5)

+-------+---------+-----------+---+-----------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|  caracteristicas|        prediction|
+-------+---------+-----------+---+-----------------+------------------+
|     21|        6|        160|110| [21.0,6.0,160.0]|162.32154816816646|
|     21|        6|        160|110| [21.0,6.0,160.0]|162.32154816816646|
|    228|        4|        108| 93|[228.0,4.0,108.0]| 82.51715587712931|
|    214|        6|        258|110|[214.0,6.0,258.0]|141.86680518718754|
|    187|        8|        360|175|[187.0,8.0,360.0]|202.93528239714834|
+-------+---------+-----------+---+-----------------+------------------+
only showing top 5 rows



### Faça Você Mesmo - Iris

1. Classe é a coluna class, portando Multiclass
2. Utilize MulticlassClassificationEvaluator
3. Use accuracy como métrica
4. Use um classificador diferente, como Naive Bayes

In [27]:
# importando dados
iris = spark.read.csv('../data/iris.csv', inferSchema = True, header = True)
iris.show(5)

+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|
+-----------+----------+-----------+----------+-----------+
only showing top 5 rows



In [28]:
# RFormula
formula = RFormula(formula = 'class ~ .', featuresCol = 'features', labelCol = 'label', handleInvalid = 'skip')

In [29]:
# transformar dataframe
iris_trans = formula.fit(iris).transform(iris).select('features', 'label')
iris_trans.show(5)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|
|[4.9,3.0,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
|[5.0,3.6,1.4,0.2]|  0.0|
+-----------------+-----+
only showing top 5 rows



In [34]:
# separação treino teste
iris_treino, iris_teste = iris_trans.randomSplit([0.7, 0.3], seed = 24)
print(iris_treino.count())
print(iris_teste.count())

109
41


In [31]:
# instanciando e treinando modelo
nb = NaiveBayes(labelCol = 'label', featuresCol = 'features')
modelo = nb.fit(iris_treino)

In [32]:
# previsao
previsao = modelo.transform(iris_teste)
previsao.show(5)

+-----------------+-----+--------------------+--------------------+----------+
|         features|label|       rawPrediction|         probability|prediction|
+-----------------+-----+--------------------+--------------------+----------+
|[4.6,3.4,1.4,0.3]|  0.0|[-11.837924235146...|[0.69884141410680...|       0.0|
|[4.7,3.2,1.3,0.2]|  0.0|[-11.138653607079...|[0.72109386097880...|       0.0|
|[4.8,3.1,1.6,0.2]|  0.0|[-11.680129423086...|[0.66513735163144...|       0.0|
|[4.8,3.4,1.6,0.2]|  0.0|[-12.007508953877...|[0.70317499282180...|       0.0|
|[4.9,3.0,1.4,0.2]|  0.0|[-11.255581098090...|[0.68938763985519...|       0.0|
+-----------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [33]:
# validação do modelo
avaliar = MulticlassClassificationEvaluator(predictionCol = 'prediction', labelCol = 'label', metricName = 'accuracy')
resultado = avaliar.evaluate(previsao)
print(resultado)

0.975609756097561
